From 1e5a50b71d8f0eae6007bedc329eecb24bb5aba3 Mon Sep 17 00:00:00 2001
From: Jesús
Date: Wed, 6 Apr 2022 03:37:17 +0800
Subject: update from upstream

---
 AUTHORS | 1136 ++++++++ CONTRIBUTORS | 91 +- MANIFEST.in | 1 + bin/hypervideo | 0 completions/zsh/_hypervideo | 30 + devscripts/make_lazy_extractors.py | 13 +- devscripts/make_supportedsites.py | 8 +- devscripts/prepare_manpage.py | 90 +- hypervideo_dl/YoutubeDL.py | 1772 +++++++----- hypervideo_dl/__init__.py | 856 +++--- hypervideo_dl/__main__.py | 0 hypervideo_dl/aes.py | 60 +- hypervideo_dl/compat.py | 40 +- hypervideo_dl/cookies.py | 356 ++- hypervideo_dl/downloader/__init__.py | 17 +- hypervideo_dl/downloader/common.py | 99 +- hypervideo_dl/downloader/dash.py | 68 +- hypervideo_dl/downloader/external.py | 102 +- hypervideo_dl/downloader/f4m.py | 2 +- hypervideo_dl/downloader/fc2.py | 41 + hypervideo_dl/downloader/fragment.py | 171 +- hypervideo_dl/downloader/hls.py | 9 + hypervideo_dl/downloader/http.py | 130 +- hypervideo_dl/downloader/ism.py | 4 +- hypervideo_dl/downloader/mhtml.py | 13 +- hypervideo_dl/downloader/rtmp.py | 3 +- hypervideo_dl/downloader/websocket.py | 7 +- hypervideo_dl/downloader/youtube_live_chat.py | 9 +- hypervideo_dl/extractor/__init__.py | 21 +- hypervideo_dl/extractor/abc.py | 67 +- hypervideo_dl/extractor/abematv.py | 476 +++ hypervideo_dl/extractor/adn.py | 30 +- hypervideo_dl/extractor/adobeconnect.py | 4 +- hypervideo_dl/extractor/adobepass.py | 61 +- hypervideo_dl/extractor/adobetv.py | 3 +- hypervideo_dl/extractor/afreecatv.py | 120 +- hypervideo_dl/extractor/aliexpress.py | 2 +- hypervideo_dl/extractor/aljazeera.py | 87 +- hypervideo_dl/extractor/allocine.py | 6 +- hypervideo_dl/extractor/alsace20tv.py | 87 + hypervideo_dl/extractor/alura.py | 9 +- hypervideo_dl/extractor/amazon.py | 53 + hypervideo_dl/extractor/animelab.py | 35 +- hypervideo_dl/extractor/animeondemand.py | 31 +- hypervideo_dl/extractor/ant1newsgr.py | 143 + hypervideo_dl/extractor/anvato.py | 7 +- hypervideo_dl/extractor/aparat.py | 15 +- hypervideo_dl/extractor/applepodcasts.py | 48 +- hypervideo_dl/extractor/archiveorg.py | 512 +++- hypervideo_dl/extractor/arcpublishing.py | 5 +- hypervideo_dl/extractor/ard.py | 76 +- hypervideo_dl/extractor/arnes.py | 3 +- hypervideo_dl/extractor/arte.py | 50 +- hypervideo_dl/extractor/asiancrush.py | 3 +- hypervideo_dl/extractor/atresplayer.py | 12 +- hypervideo_dl/extractor/atvat.py | 6 + hypervideo_dl/extractor/audiomack.py | 35 +- hypervideo_dl/extractor/awaan.py | 5 +- hypervideo_dl/extractor/azmedien.py | 10 +- hypervideo_dl/extractor/banbye.py | 153 + hypervideo_dl/extractor/bandaichannel.py | 1 - hypervideo_dl/extractor/bandcamp.py | 69 +- hypervideo_dl/extractor/bbc.py | 89 +- hypervideo_dl/extractor/beeg.py | 123 +- hypervideo_dl/extractor/bigo.py | 59 + hypervideo_dl/extractor/bilibili.py | 405 ++- hypervideo_dl/extractor/biqle.py | 93 +- hypervideo_dl/extractor/bitwave.py | 2 +- hypervideo_dl/extractor/blogger.py | 54 + hypervideo_dl/extractor/bongacams.py | 2 +- hypervideo_dl/extractor/br.py | 5 +- hypervideo_dl/extractor/breitbart.py | 38 + hypervideo_dl/extractor/brightcove.py | 40 +- hypervideo_dl/extractor/cableav.py | 34 + hypervideo_dl/extractor/callin.py | 114 + hypervideo_dl/extractor/caltrans.py | 41 + hypervideo_dl/extractor/cam4.py | 5 +- hypervideo_dl/extractor/cammodels.py | 2 +- hypervideo_dl/extractor/canalalpha.py | 98 + hypervideo_dl/extractor/canvas.py | 68 +- hypervideo_dl/extractor/carambatv.py | 3 +- hypervideo_dl/extractor/cbc.py | 182 +- 
hypervideo_dl/extractor/cbs.py | 28 +- hypervideo_dl/extractor/ccma.py | 13 +- hypervideo_dl/extractor/cctv.py | 3 +- hypervideo_dl/extractor/ceskatelevize.py | 130 +- hypervideo_dl/extractor/chaturbate.py | 2 +- hypervideo_dl/extractor/chingari.py | 4 +- hypervideo_dl/extractor/closertotruth.py | 3 +- hypervideo_dl/extractor/common.py | 468 ++- hypervideo_dl/extractor/corus.py | 1 - hypervideo_dl/extractor/coub.py | 3 +- hypervideo_dl/extractor/cozytv.py | 40 + hypervideo_dl/extractor/cpac.py | 148 + hypervideo_dl/extractor/crackle.py | 40 +- hypervideo_dl/extractor/craftsy.py | 71 + hypervideo_dl/extractor/crowdbunker.py | 113 + hypervideo_dl/extractor/crunchyroll.py | 359 ++- hypervideo_dl/extractor/cspan.py | 52 +- hypervideo_dl/extractor/ctvnews.py | 5 + hypervideo_dl/extractor/curiositystream.py | 84 +- hypervideo_dl/extractor/cybrary.py | 146 + hypervideo_dl/extractor/daftsex.py | 146 + hypervideo_dl/extractor/dailymotion.py | 33 +- hypervideo_dl/extractor/daum.py | 5 +- hypervideo_dl/extractor/daystar.py | 48 + hypervideo_dl/extractor/digitalconcerthall.py | 141 + hypervideo_dl/extractor/disney.py | 9 +- hypervideo_dl/extractor/dispeak.py | 3 +- hypervideo_dl/extractor/dlive.py | 2 +- hypervideo_dl/extractor/doodstream.py | 37 +- hypervideo_dl/extractor/douyutv.py | 2 +- hypervideo_dl/extractor/dplay.py | 857 ++++-- hypervideo_dl/extractor/drooble.py | 116 + hypervideo_dl/extractor/dropbox.py | 44 +- hypervideo_dl/extractor/dropout.py | 212 ++ hypervideo_dl/extractor/drtv.py | 18 +- hypervideo_dl/extractor/dvtv.py | 7 +- hypervideo_dl/extractor/egghead.py | 1 - hypervideo_dl/extractor/ellentube.py | 3 +- hypervideo_dl/extractor/elonet.py | 85 +- hypervideo_dl/extractor/engadget.py | 10 - hypervideo_dl/extractor/epicon.py | 4 +- hypervideo_dl/extractor/eroprofile.py | 9 +- hypervideo_dl/extractor/ertgr.py | 316 ++ hypervideo_dl/extractor/espn.py | 43 + hypervideo_dl/extractor/europeantour.py | 37 + hypervideo_dl/extractor/euscreen.py | 2 +- hypervideo_dl/extractor/extractors.py | 366 ++- hypervideo_dl/extractor/facebook.py | 158 +- hypervideo_dl/extractor/fancode.py | 41 +- hypervideo_dl/extractor/fc2.py | 201 +- hypervideo_dl/extractor/filmon.py | 2 +- hypervideo_dl/extractor/fivetv.py | 3 +- hypervideo_dl/extractor/flickr.py | 3 +- hypervideo_dl/extractor/fox.py | 39 +- hypervideo_dl/extractor/foxgay.py | 3 +- hypervideo_dl/extractor/fptplay.py | 102 + hypervideo_dl/extractor/franceculture.py | 101 +- hypervideo_dl/extractor/francetv.py | 6 +- hypervideo_dl/extractor/frontendmasters.py | 13 +- hypervideo_dl/extractor/fujitv.py | 70 +- hypervideo_dl/extractor/funimation.py | 25 +- hypervideo_dl/extractor/funk.py | 2 +- hypervideo_dl/extractor/gab.py | 89 +- hypervideo_dl/extractor/gaia.py | 30 +- hypervideo_dl/extractor/gamejolt.py | 541 ++++ hypervideo_dl/extractor/generic.py | 461 ++- hypervideo_dl/extractor/gettr.py | 159 +- hypervideo_dl/extractor/gfycat.py | 43 +- hypervideo_dl/extractor/glide.py | 4 +- hypervideo_dl/extractor/globo.py | 43 +- hypervideo_dl/extractor/glomex.py | 220 ++ hypervideo_dl/extractor/go.py | 8 +- hypervideo_dl/extractor/gofile.py | 83 + hypervideo_dl/extractor/googlesearch.py | 21 +- hypervideo_dl/extractor/gronkh.py | 5 +- hypervideo_dl/extractor/hellporno.py | 3 +- hypervideo_dl/extractor/hidive.py | 8 +- hypervideo_dl/extractor/hitbox.py | 2 +- hypervideo_dl/extractor/hotstar.py | 10 +- hypervideo_dl/extractor/hrfensehen.py | 10 +- hypervideo_dl/extractor/hrti.py | 15 +- hypervideo_dl/extractor/hse.py | 95 + hypervideo_dl/extractor/huffpost.py | 3 
- hypervideo_dl/extractor/huya.py | 137 + hypervideo_dl/extractor/imdb.py | 64 +- hypervideo_dl/extractor/imggaming.py | 22 +- hypervideo_dl/extractor/infoq.py | 2 +- hypervideo_dl/extractor/instagram.py | 552 ++-- hypervideo_dl/extractor/internazionale.py | 6 - hypervideo_dl/extractor/iprima.py | 145 +- hypervideo_dl/extractor/iqiyi.py | 377 ++- hypervideo_dl/extractor/itprotv.py | 141 + hypervideo_dl/extractor/itv.py | 44 +- hypervideo_dl/extractor/ivideon.py | 2 +- hypervideo_dl/extractor/iwara.py | 3 +- hypervideo_dl/extractor/jamendo.py | 2 +- hypervideo_dl/extractor/joj.py | 3 +- hypervideo_dl/extractor/kakao.py | 46 +- hypervideo_dl/extractor/kaltura.py | 11 +- hypervideo_dl/extractor/keezmovies.py | 3 +- hypervideo_dl/extractor/kelbyone.py | 84 + hypervideo_dl/extractor/kinopoisk.py | 3 - hypervideo_dl/extractor/koo.py | 2 +- hypervideo_dl/extractor/la7.py | 54 +- hypervideo_dl/extractor/laola1tv.py | 4 +- hypervideo_dl/extractor/lastfm.py | 129 + hypervideo_dl/extractor/lbry.py | 43 +- hypervideo_dl/extractor/lecturio.py | 9 +- hypervideo_dl/extractor/lego.py | 7 +- hypervideo_dl/extractor/limelight.py | 2 +- hypervideo_dl/extractor/line.py | 112 +- hypervideo_dl/extractor/linkedin.py | 100 +- hypervideo_dl/extractor/linuxacademy.py | 9 +- hypervideo_dl/extractor/litv.py | 23 +- hypervideo_dl/extractor/livestream.py | 4 +- hypervideo_dl/extractor/lnkgo.py | 88 +- hypervideo_dl/extractor/lynda.py | 11 +- hypervideo_dl/extractor/mainstreaming.py | 219 ++ hypervideo_dl/extractor/mangomolo.py | 2 +- hypervideo_dl/extractor/manyvids.py | 1 + hypervideo_dl/extractor/matchtv.py | 2 +- hypervideo_dl/extractor/mdr.py | 12 +- hypervideo_dl/extractor/medaltv.py | 3 +- hypervideo_dl/extractor/mediaklikk.py | 4 +- hypervideo_dl/extractor/mediaset.py | 165 +- hypervideo_dl/extractor/mediasite.py | 11 +- hypervideo_dl/extractor/megatvcom.py | 173 ++ hypervideo_dl/extractor/mgtv.py | 59 +- hypervideo_dl/extractor/miaopai.py | 3 +- hypervideo_dl/extractor/microsoftstream.py | 125 + hypervideo_dl/extractor/mildom.py | 336 ++- hypervideo_dl/extractor/minds.py | 3 +- hypervideo_dl/extractor/mirrativ.py | 83 +- hypervideo_dl/extractor/mixch.py | 85 + hypervideo_dl/extractor/mixcloud.py | 16 +- hypervideo_dl/extractor/mlssoccer.py | 117 + hypervideo_dl/extractor/mojvideo.py | 3 +- hypervideo_dl/extractor/mtv.py | 17 +- hypervideo_dl/extractor/muenchentv.py | 2 +- hypervideo_dl/extractor/murrtube.py | 165 ++ hypervideo_dl/extractor/musescore.py | 8 +- hypervideo_dl/extractor/musicdex.py | 175 ++ hypervideo_dl/extractor/mxplayer.py | 2 +- hypervideo_dl/extractor/myspass.py | 63 +- hypervideo_dl/extractor/n1.py | 22 +- hypervideo_dl/extractor/nate.py | 124 + hypervideo_dl/extractor/naver.py | 7 +- hypervideo_dl/extractor/nba.py | 12 +- hypervideo_dl/extractor/nbc.py | 27 +- hypervideo_dl/extractor/ndr.py | 2 - hypervideo_dl/extractor/nebula.py | 368 +-- hypervideo_dl/extractor/neteasemusic.py | 13 +- hypervideo_dl/extractor/newgrounds.py | 25 +- hypervideo_dl/extractor/newstube.py | 10 +- hypervideo_dl/extractor/newsy.py | 51 + hypervideo_dl/extractor/nexx.py | 147 +- hypervideo_dl/extractor/nfb.py | 62 + hypervideo_dl/extractor/nfl.py | 2 +- hypervideo_dl/extractor/nhk.py | 152 +- hypervideo_dl/extractor/niconico.py | 823 +++--- hypervideo_dl/extractor/ninecninemedia.py | 35 +- hypervideo_dl/extractor/nitter.py | 221 +- hypervideo_dl/extractor/njpwworld.py | 19 +- hypervideo_dl/extractor/noco.py | 9 +- hypervideo_dl/extractor/noodlemagazine.py | 67 + hypervideo_dl/extractor/nova.py | 34 +- 
hypervideo_dl/extractor/novaplay.py | 4 +- hypervideo_dl/extractor/npo.py | 4 +- hypervideo_dl/extractor/npr.py | 3 +- hypervideo_dl/extractor/nrk.py | 13 +- hypervideo_dl/extractor/nrl.py | 1 - hypervideo_dl/extractor/ntvcojp.py | 27 +- hypervideo_dl/extractor/nuvid.py | 49 +- hypervideo_dl/extractor/odnoklassniki.py | 97 +- hypervideo_dl/extractor/oktoberfesttv.py | 4 +- hypervideo_dl/extractor/olympics.py | 71 +- hypervideo_dl/extractor/ondemandkorea.py | 6 +- hypervideo_dl/extractor/onefootball.py | 51 + hypervideo_dl/extractor/onet.py | 7 +- hypervideo_dl/extractor/opencast.py | 177 ++ hypervideo_dl/extractor/openload.py | 14 +- hypervideo_dl/extractor/openrec.py | 161 +- hypervideo_dl/extractor/orf.py | 231 +- hypervideo_dl/extractor/packtpub.py | 5 +- hypervideo_dl/extractor/panopto.py | 607 ++++ hypervideo_dl/extractor/paramountplus.py | 31 +- hypervideo_dl/extractor/parliamentliveuk.py | 3 - hypervideo_dl/extractor/patreon.py | 12 +- hypervideo_dl/extractor/pbs.py | 7 +- hypervideo_dl/extractor/peekvids.py | 81 + hypervideo_dl/extractor/peertube.py | 5 +- hypervideo_dl/extractor/peertv.py | 57 + hypervideo_dl/extractor/peloton.py | 1 - hypervideo_dl/extractor/periscope.py | 2 +- hypervideo_dl/extractor/piapro.py | 96 + hypervideo_dl/extractor/picarto.py | 4 +- hypervideo_dl/extractor/piksel.py | 10 +- hypervideo_dl/extractor/pixivsketch.py | 122 + hypervideo_dl/extractor/pladform.py | 26 +- hypervideo_dl/extractor/planetmarathi.py | 76 + hypervideo_dl/extractor/platzi.py | 9 +- hypervideo_dl/extractor/playplustv.py | 12 +- hypervideo_dl/extractor/playtvak.py | 2 - hypervideo_dl/extractor/playvid.py | 3 +- hypervideo_dl/extractor/pluralsight.py | 9 +- hypervideo_dl/extractor/plutotv.py | 7 +- hypervideo_dl/extractor/pokemon.py | 40 + hypervideo_dl/extractor/pokergo.py | 109 + hypervideo_dl/extractor/polsatgo.py | 90 + hypervideo_dl/extractor/polskieradio.py | 303 +- hypervideo_dl/extractor/pornez.py | 43 + hypervideo_dl/extractor/pornflip.py | 1 - hypervideo_dl/extractor/pornhub.py | 16 +- hypervideo_dl/extractor/projectveritas.py | 2 +- hypervideo_dl/extractor/prx.py | 431 +++ hypervideo_dl/extractor/radiode.py | 2 +- hypervideo_dl/extractor/radiokapital.py | 99 + hypervideo_dl/extractor/radiozet.py | 51 + hypervideo_dl/extractor/radlive.py | 10 +- hypervideo_dl/extractor/rai.py | 198 +- hypervideo_dl/extractor/rcti.py | 128 +- hypervideo_dl/extractor/redbulltv.py | 3 +- hypervideo_dl/extractor/reddit.py | 86 +- hypervideo_dl/extractor/redgifs.py | 232 ++ hypervideo_dl/extractor/redtube.py | 35 +- hypervideo_dl/extractor/rmcdecouverte.py | 1 - hypervideo_dl/extractor/rokfin.py | 256 ++ hypervideo_dl/extractor/roosterteeth.py | 208 +- hypervideo_dl/extractor/rtbf.py | 2 - hypervideo_dl/extractor/rtl2.py | 16 +- hypervideo_dl/extractor/rtnews.py | 199 ++ hypervideo_dl/extractor/rtrfm.py | 67 + hypervideo_dl/extractor/rtve.py | 95 +- hypervideo_dl/extractor/rtvs.py | 74 +- hypervideo_dl/extractor/rule34video.py | 65 + hypervideo_dl/extractor/rumble.py | 17 +- hypervideo_dl/extractor/rutube.py | 21 +- hypervideo_dl/extractor/rutv.py | 13 +- hypervideo_dl/extractor/ruutu.py | 15 + hypervideo_dl/extractor/ruv.py | 88 + hypervideo_dl/extractor/safari.py | 9 +- hypervideo_dl/extractor/sbs.py | 17 +- hypervideo_dl/extractor/scte.py | 9 +- hypervideo_dl/extractor/senategov.py | 213 ++ hypervideo_dl/extractor/sendtonews.py | 2 +- hypervideo_dl/extractor/sevenplus.py | 1 - hypervideo_dl/extractor/shahid.py | 8 +- hypervideo_dl/extractor/shemaroome.py | 11 +- 
hypervideo_dl/extractor/showroomlive.py | 2 +- hypervideo_dl/extractor/skeb.py | 143 + hypervideo_dl/extractor/sky.py | 28 + hypervideo_dl/extractor/skyit.py | 7 +- hypervideo_dl/extractor/skylinewebcams.py | 2 +- hypervideo_dl/extractor/skynewsau.py | 2 +- hypervideo_dl/extractor/slideslive.py | 3 - hypervideo_dl/extractor/sonyliv.py | 60 +- hypervideo_dl/extractor/soundcloud.py | 344 ++- hypervideo_dl/extractor/southpark.py | 17 +- hypervideo_dl/extractor/sovietscloset.py | 15 +- hypervideo_dl/extractor/spiegel.py | 2 +- hypervideo_dl/extractor/sportdeutschland.py | 8 +- hypervideo_dl/extractor/srgssr.py | 7 +- hypervideo_dl/extractor/steam.py | 140 +- hypervideo_dl/extractor/storyfire.py | 17 +- hypervideo_dl/extractor/streamcz.py | 173 +- hypervideo_dl/extractor/streamff.py | 31 + hypervideo_dl/extractor/stripchat.py | 66 + hypervideo_dl/extractor/stv.py | 5 +- hypervideo_dl/extractor/sunporno.py | 3 +- hypervideo_dl/extractor/svt.py | 32 +- hypervideo_dl/extractor/tagesschau.py | 279 +- hypervideo_dl/extractor/teachable.py | 3 +- hypervideo_dl/extractor/teamtreehouse.py | 7 +- hypervideo_dl/extractor/ted.py | 477 ++-- hypervideo_dl/extractor/tele5.py | 87 +- hypervideo_dl/extractor/telebruxelles.py | 2 +- hypervideo_dl/extractor/telegram.py | 37 + hypervideo_dl/extractor/telemundo.py | 5 +- hypervideo_dl/extractor/telequebec.py | 12 - hypervideo_dl/extractor/tennistv.py | 9 +- hypervideo_dl/extractor/tenplay.py | 44 +- hypervideo_dl/extractor/tf1.py | 1 - hypervideo_dl/extractor/theta.py | 10 +- hypervideo_dl/extractor/thisav.py | 4 +- hypervideo_dl/extractor/thisoldhouse.py | 17 +- hypervideo_dl/extractor/threeqsdn.py | 18 +- hypervideo_dl/extractor/threespeak.py | 97 + hypervideo_dl/extractor/tiktok.py | 449 ++- hypervideo_dl/extractor/toggo.py | 73 + hypervideo_dl/extractor/tokentube.py | 12 +- hypervideo_dl/extractor/tonline.py | 9 +- hypervideo_dl/extractor/toutv.py | 7 +- hypervideo_dl/extractor/traileraddict.py | 3 +- hypervideo_dl/extractor/trovo.py | 43 +- hypervideo_dl/extractor/trueid.py | 139 + hypervideo_dl/extractor/tubitv.py | 20 +- hypervideo_dl/extractor/tumblr.py | 408 ++- hypervideo_dl/extractor/tunein.py | 2 +- hypervideo_dl/extractor/turner.py | 2 +- hypervideo_dl/extractor/tv2.py | 17 +- hypervideo_dl/extractor/tv2dk.py | 17 +- hypervideo_dl/extractor/tver.py | 37 +- hypervideo_dl/extractor/tvnet.py | 7 +- hypervideo_dl/extractor/tvopengr.py | 128 + hypervideo_dl/extractor/tvp.py | 461 ++- hypervideo_dl/extractor/tvplay.py | 114 +- hypervideo_dl/extractor/tvplayer.py | 2 +- hypervideo_dl/extractor/twitcasting.py | 166 +- hypervideo_dl/extractor/twitch.py | 96 +- hypervideo_dl/extractor/twitter.py | 13 +- hypervideo_dl/extractor/udemy.py | 9 +- hypervideo_dl/extractor/uol.py | 1 - hypervideo_dl/extractor/urplay.py | 53 +- hypervideo_dl/extractor/ustream.py | 5 +- hypervideo_dl/extractor/utreon.py | 2 +- hypervideo_dl/extractor/varzesh3.py | 3 +- hypervideo_dl/extractor/veo.py | 47 +- hypervideo_dl/extractor/veoh.py | 62 +- hypervideo_dl/extractor/vgtv.py | 6 +- hypervideo_dl/extractor/vice.py | 1 - hypervideo_dl/extractor/videa.py | 9 +- hypervideo_dl/extractor/videocampus_sachsen.py | 96 + hypervideo_dl/extractor/vidio.py | 14 +- hypervideo_dl/extractor/vidlii.py | 50 +- hypervideo_dl/extractor/viewlift.py | 189 +- hypervideo_dl/extractor/viki.py | 30 +- hypervideo_dl/extractor/vimeo.py | 521 ++-- hypervideo_dl/extractor/vimm.py | 69 + hypervideo_dl/extractor/vine.py | 3 +- hypervideo_dl/extractor/viu.py | 226 +- hypervideo_dl/extractor/vk.py | 118 +- 
hypervideo_dl/extractor/vlive.py | 256 +- hypervideo_dl/extractor/voicy.py | 7 +- hypervideo_dl/extractor/voot.py | 2 +- hypervideo_dl/extractor/vrv.py | 67 +- hypervideo_dl/extractor/vshare.py | 3 +- hypervideo_dl/extractor/vupload.py | 12 +- hypervideo_dl/extractor/vyborymos.py | 4 +- hypervideo_dl/extractor/wakanim.py | 26 +- hypervideo_dl/extractor/wasdtv.py | 161 ++ hypervideo_dl/extractor/washingtonpost.py | 21 +- hypervideo_dl/extractor/watchbox.py | 2 - hypervideo_dl/extractor/wdr.py | 65 +- hypervideo_dl/extractor/webcaster.py | 8 +- hypervideo_dl/extractor/weibo.py | 3 +- hypervideo_dl/extractor/whowatch.py | 9 +- hypervideo_dl/extractor/willow.py | 58 + hypervideo_dl/extractor/wppilot.py | 177 ++ hypervideo_dl/extractor/xinpianchang.py | 95 + hypervideo_dl/extractor/xnxx.py | 5 +- hypervideo_dl/extractor/xvideos.py | 32 +- hypervideo_dl/extractor/yahoo.py | 46 +- hypervideo_dl/extractor/yandexvideo.py | 99 +- hypervideo_dl/extractor/youjizz.py | 3 +- hypervideo_dl/extractor/younow.py | 5 +- hypervideo_dl/extractor/youtube.py | 3657 ++++++++++++++++-------- hypervideo_dl/extractor/zattoo.py | 25 +- hypervideo_dl/extractor/zdf.py | 61 +- hypervideo_dl/extractor/zee5.py | 117 +- hypervideo_dl/extractor/zhihu.py | 4 +- hypervideo_dl/extractor/zingmp3.py | 159 +- hypervideo_dl/extractor/zoom.py | 40 +- hypervideo_dl/jsinterp.py | 492 +++- hypervideo_dl/minicurses.py | 86 +- hypervideo_dl/options.py | 531 ++-- hypervideo_dl/postprocessor/__init__.py | 8 +- hypervideo_dl/postprocessor/common.py | 44 +- hypervideo_dl/postprocessor/embedthumbnail.py | 77 +- hypervideo_dl/postprocessor/exec.py | 21 +- hypervideo_dl/postprocessor/ffmpeg.py | 519 ++-- hypervideo_dl/postprocessor/metadataparser.py | 29 +- hypervideo_dl/postprocessor/modify_chapters.py | 22 +- hypervideo_dl/postprocessor/sponskrub.py | 13 +- hypervideo_dl/postprocessor/sponsorblock.py | 37 +- hypervideo_dl/utils.py | 2738 ++++++------------ hypervideo_dl/version.py | 6 +- hypervideo_dl/webvtt.py | 8 +- requirements.txt | 3 + setup.py | 6 +- test/helper.py | 47 +- test/parameters.json | 2 +- test/test_InfoExtractor.py | 184 +- test/test_YoutubeDL.py | 56 +- test/test_aes.py | 18 +- test/test_all_urls.py | 1 - test/test_cookies.py | 36 +- test/test_download.py | 2 +- test/test_netrc.py | 13 +- test/test_postprocessors.py | 4 +- test/test_subtitles.py | 4 +- test/test_utils.py | 214 +- test/test_verbose_output.py | 16 +- test/test_youtube_lists.py | 42 +- 475 files changed, 32120 insertions(+), 11787 deletions(-) create mode 100644 AUTHORS mode change 100755 => 100644 bin/hypervideo create mode 100644 completions/zsh/_hypervideo mode change 100755 => 100644 hypervideo_dl/YoutubeDL.py mode change 100755 => 100644 hypervideo_dl/__main__.py create mode 100644 hypervideo_dl/downloader/fc2.py create mode 100644 hypervideo_dl/extractor/abematv.py create mode 100644 hypervideo_dl/extractor/alsace20tv.py create mode 100644 hypervideo_dl/extractor/amazon.py create mode 100644 hypervideo_dl/extractor/ant1newsgr.py create mode 100644 hypervideo_dl/extractor/banbye.py create mode 100644 hypervideo_dl/extractor/bigo.py create mode 100644 hypervideo_dl/extractor/blogger.py create mode 100644 hypervideo_dl/extractor/breitbart.py create mode 100644 hypervideo_dl/extractor/cableav.py create mode 100644 hypervideo_dl/extractor/callin.py create mode 100644 hypervideo_dl/extractor/caltrans.py create mode 100644 hypervideo_dl/extractor/canalalpha.py create mode 100644 hypervideo_dl/extractor/cozytv.py create mode 100644 
hypervideo_dl/extractor/cpac.py create mode 100644 hypervideo_dl/extractor/craftsy.py create mode 100644 hypervideo_dl/extractor/crowdbunker.py create mode 100644 hypervideo_dl/extractor/cybrary.py create mode 100644 hypervideo_dl/extractor/daftsex.py create mode 100644 hypervideo_dl/extractor/daystar.py create mode 100644 hypervideo_dl/extractor/digitalconcerthall.py create mode 100644 hypervideo_dl/extractor/drooble.py create mode 100644 hypervideo_dl/extractor/dropout.py create mode 100644 hypervideo_dl/extractor/ertgr.py create mode 100644 hypervideo_dl/extractor/europeantour.py create mode 100644 hypervideo_dl/extractor/fptplay.py create mode 100644 hypervideo_dl/extractor/gamejolt.py create mode 100644 hypervideo_dl/extractor/glomex.py create mode 100644 hypervideo_dl/extractor/gofile.py create mode 100644 hypervideo_dl/extractor/hse.py create mode 100644 hypervideo_dl/extractor/huya.py create mode 100644 hypervideo_dl/extractor/itprotv.py create mode 100644 hypervideo_dl/extractor/kelbyone.py create mode 100644 hypervideo_dl/extractor/lastfm.py create mode 100644 hypervideo_dl/extractor/mainstreaming.py create mode 100644 hypervideo_dl/extractor/megatvcom.py create mode 100644 hypervideo_dl/extractor/microsoftstream.py create mode 100644 hypervideo_dl/extractor/mixch.py create mode 100644 hypervideo_dl/extractor/mlssoccer.py create mode 100644 hypervideo_dl/extractor/murrtube.py create mode 100644 hypervideo_dl/extractor/musicdex.py create mode 100644 hypervideo_dl/extractor/nate.py create mode 100644 hypervideo_dl/extractor/newsy.py create mode 100644 hypervideo_dl/extractor/nfb.py create mode 100644 hypervideo_dl/extractor/noodlemagazine.py create mode 100644 hypervideo_dl/extractor/onefootball.py create mode 100644 hypervideo_dl/extractor/opencast.py create mode 100644 hypervideo_dl/extractor/panopto.py create mode 100644 hypervideo_dl/extractor/peekvids.py create mode 100644 hypervideo_dl/extractor/peertv.py create mode 100644 hypervideo_dl/extractor/piapro.py create mode 100644 hypervideo_dl/extractor/pixivsketch.py create mode 100644 hypervideo_dl/extractor/planetmarathi.py create mode 100644 hypervideo_dl/extractor/pokergo.py create mode 100644 hypervideo_dl/extractor/polsatgo.py create mode 100644 hypervideo_dl/extractor/pornez.py create mode 100644 hypervideo_dl/extractor/prx.py create mode 100644 hypervideo_dl/extractor/radiokapital.py create mode 100644 hypervideo_dl/extractor/radiozet.py create mode 100644 hypervideo_dl/extractor/redgifs.py create mode 100644 hypervideo_dl/extractor/rokfin.py create mode 100644 hypervideo_dl/extractor/rtnews.py create mode 100644 hypervideo_dl/extractor/rtrfm.py create mode 100644 hypervideo_dl/extractor/rule34video.py create mode 100644 hypervideo_dl/extractor/senategov.py create mode 100644 hypervideo_dl/extractor/skeb.py create mode 100644 hypervideo_dl/extractor/streamff.py create mode 100644 hypervideo_dl/extractor/stripchat.py create mode 100644 hypervideo_dl/extractor/telegram.py create mode 100644 hypervideo_dl/extractor/threespeak.py create mode 100644 hypervideo_dl/extractor/toggo.py create mode 100644 hypervideo_dl/extractor/trueid.py create mode 100644 hypervideo_dl/extractor/tvopengr.py create mode 100644 hypervideo_dl/extractor/videocampus_sachsen.py create mode 100644 hypervideo_dl/extractor/vimm.py create mode 100644 hypervideo_dl/extractor/wasdtv.py create mode 100644 hypervideo_dl/extractor/willow.py create mode 100644 hypervideo_dl/extractor/wppilot.py create mode 100644 hypervideo_dl/extractor/xinpianchang.py mode 
change 100644 => 100755 test/test_download.py diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..33923ec --- /dev/null +++ b/AUTHORS @@ -0,0 +1,1136 @@ +0l-l0 +0x9fff00 +1-Byte +23rd +2ShedsJackson +3risian +4a1e2y5 +4rensiker +50csent +5moufl +A Connecticut Princess +AGSPhoenix +Aakash Gajjar +Aarni Koskela +Aaron Brager +Aaron Lipinski +Aaron Wojnowski +Aaron Zeng +Abdullah Ibn Fulan +Abhishek Kedia +Adam +Adam Glenn +Adam Malcontenti-Wilson +Adam Mesha +Adam Thalhammer +Adam Voss +Adrian Heine né Lang +Adrian Kretz +Adrik +Aidan Rowe +Alan Yee +Albert Kim +Aldo Gunsing +Aleksandar Topuzovic +Aleksander Nitecki +Aleri Kaisattera +Ales Jirasek +Alessandro Ghedini +Alex Merkel +Alex Monk +Alex Seiler +Alex Van't Hof +Alex Vong +Alexander Kirk +Alexander Simon +Alexander van Gessel +Alexandre Huot +Alexandre Macabies +Alexey Trofimov +Alf Marius +Alfonso Solbes +Ali Irani +Ali Sherief +Allan Daemon +Allan Zhou +Alpesh Valia +Amaury Gauthier +Amish Bhadeshia +Anand Babu Periasamy +Anarky +Anders Einar Hilden +Andras Elso +Andre Walker +Andreas Schmitz +Andrei Troie +AndreiArba +Andrew "Akari" Alexeyew +Andrew Bottom +Andrew J. Erickson +Andrew Morgan +Andrew Udvare +AndrewMBL +Andrey Smirnoff +AndroKev +Andrzej Lichnerowicz +András Veres-Szentkirályi +Andy Savicki +Anh Nhan Nguyen +Aniruddh Joshi +Aniruddh-J +Anisse Astier +Anna Bernardi +Anssi Hannula +Anthony Fok +Anthony J. Bentley +Anthony Weems +Anton Larionov +Anton Novosyolov +Antti Ajanki +Arend v. Reinersdorff +Argn0 +Ariset Llerena +Arjan Verwer +Arjun Sreedharan +Art Zhitnik +Arvydas Sidorenko +Ashish Gupta +Ashutosh Chaudhary +Atlas Sullivan +Attila-Mihaly Balazs +Aurora +Aurélien Dunand +Aurélien Grosdidier +Aurélio A. Heckert +Austin Adams +Austin de Coup-Crank +Awal Garg +Bagira +Barbara Miller +Barbu Paul - Gheorghe +Bart Kappenburg +Bastian de Groot +Batuhan's Unmaintained Account +Behrooz +Ben Rog-Wilhelm +Benedikt Wildenhain +Benjamin Congdon +Bepis +Bernhard M. Wiedemann +Bjorn Heesakkers +BlahGeek +Bob Poekert +BohwaZ +Bojidar Qnkov +Boris Wachtmeister +Brian Foley +Brian Marks +Bricio +BunnyHelp +CHJ85 +CXwudi +Camillo Dell'mour +Carlos Ramos +Celthi +CeruleanSky +Cédric Luthi +Charles Chen +Charlie Le +ChillingPepper +Ching Yi, Chan +Chirantan Ekbote +Chris Gavin +Chris Hranj +Christian Albrecht +Christian Paul +Christian Pointner +Christoph Döpmann +Christopher Krooss +Christopher Neugebauer +Christopher Smith +Chuck Cho +Cian Ruane +CkuT +Clément DAVID +Corey Farwell +Corey Nicholson +Cory Hall +Costy Petrisor +CplPwnies +Craig Markwardt +CrypticSignal +CyberJacob +Cyril Roelandt +Cássio Ávila +DEvmIb +DaMightyZombie +Daan van Vugt +Damiano Amatruda +Damon Timm +Dan Church +Dan Salmon +Dan Walker +Dan Weber +Daniel +Daniel Bolton +Daniel Höpfl +Daniel Peukert +Daniel Twardowski +Daniel.Zeng +Danko Alexeyev +Dankryn +Dao Hoang Son +Dario Guarascio +DarkZeros +DarkstaIkers +Dave +Dave Loyall +Dave Vasilevsky +David +David Bauer +David Ben Zakai +David Caldwell +David Coppa +David Development +David Fabijan +David Haberthür +David Powell +David Rabinowitz +David Skrundz +David Triendl +David Wagner +Deer-Spangle +Delon +Derek Land +DesweR +Devin J. 
Pohly +Devon Meunier +Diego Fernando Rodríguez Varón +DigitalDJ +Dimitre Liotev +Dobrosław Żybort +Dominik +Dominik Heidler +Dorian Westacott +Douglas Su +DrWursterich +Dracony +DroidFreak32 +Duncan +Duncan Keall +Déstin Reed +Eduardo Ferro +Edward Betts +Eitan Adler +Eitan Postavsky +Elan Ruusamäe +Elias Probst +Emanuel Hoogeveen +Emilien Kenler +Emmanuel Froissart +Enes +EntranceJew +Entropy +Eric Wong +Erik +Erik Johnson +Erwin de Haan +FND +Fabian Stahl +Fai +Fam0r +Felix S +Felix Stupp +Felix Yan +FestplattenSchnitzel +Filip B +Filippo Valsorda +Finn Petersen +FireDart +FliegendeWurst +FooBarQuaxx +Founder Fang +Francesco Frassinelli +Francois du Toit +Frans de Jonge +François Charlier +François Revol +Frederic Bournival +GDR! +Gabriel Schubiner +Gaetan Gilbert +Gary +Gaurav +Gautam M +Genki Sky +Georg Jaehnig +George Boyle +George Brighton +George Schizas +Georgi Saev +Georgi Valkov +Gergely Imreh +Giedrius Statkevičius +Gilles Pietri +Gino Lisignoli +Giovanni Visentini +Giuseppe Fabiano +Gjorgji Jankovski +Glenn Slayden +Gorfiend +Grabien +GreyAlien502 +Grom PE +Grzegorz P +Grzegorz Ruciński +Guillem Vela +Ha Tien Loi +Hadi0609 +Hakim Boyles +Han Dai +HanYOLO +Hannu Hartikainen +Hannu Lintala +Haricharan Padmanaban +Hendrik Schröter +Hendrik v. Raven +Henrik Heimbuerger +Hirokuni Yano +Hongjie Dong +Hormoz K +Hubert Hirtz +Hugo Alves De Azevedo +Huyuumi +IONECarter +Idan Kamara +InfernalUnderling +Irfan Charania +Isaac-the-Man +Ismael Mejia +Itay Brandes +Iulian Onofrei +Ivan Kozik +J +J.D. Purcell +JChris246 +Jack Danger Canty +Jacob Chapman +Jacob Kaplan-Moss +Jai Grimshaw +Jaime Marquínez Ferrándiz +Jaime Marquínez Ferrándiz +Jakub Adam Wieczorek +Jakub Wilk +Jalaz Kumar +JamKage +Jan 'Yenda' Trmal +Jan Friesse +Jan Kratochvil +Jan Kundrát +Jan Schär +Janez Troha +Jason Normore +Jason Terk +Jay +Jeff Buchbinder +Jeff Crouse +Jeff Huffman +Jeff Smith +Jelle van der Waa +Jens Rutschmann +Jens Timmerman +Jens Wille +Jeremie J. Jarosh +Jertzukka +Jesse +Jesse de Zwart +Jesús +Jia Rong Yee +JianxinLi +Jimbolino +Jimm Stout +Joakim Fremstad +Jody Bruchon +Joe Frambach +Joel Potts +Joel Verhagen +Joey Adams +Johan +Johan K. Jensen +Johannes Knoedtel +Johannes N +John Assael +John Boehr +John D +John Hawkinson +John Peel +Johny Mo Swag +Joost Verdoorn +Joram Schrijver +JordanWeatherby +Joseph Frazier +Joseph Spiros +Josh Soref +Joshua Elsasser +Joshua Lochner +Josu Moreno +Jouke Waleson +Juan C. Olivares +Juan Carlos Garcia Segovia +Juan Francisco Cantero Hurtado +Juan M +Juanjo Benages +Jules-A +Julien Hadley Jack +Justin Keogh +Justin Quan +Justsoos +Jérôme Duval +Kacper Michajłow +Kagami Hiiragi +Kai Weber +Kang Hyojun +Kareem Moussa +Kazuma Takahara +Kegan +Keith Beckman +Ken Swenson +Kevin Deldycke +Kevin Kwan +Kevin Ngo +Kevin O'Connor +Kevin Velghe +Kfir Breger +Khang Nguyen +KiberInfinity +Kid +Kieran O'Reilly +Kitten King +Kyle +Kyu Yeun Kim +LE +Laneone +LangerJan +Lapinot +Lars Vierbergen +Lauren Liberda +Laurent Raufaste +Leonardo Amaral +Leonardo Taccari +Leslie P. 
Polzer +Lesmiscore (Naoya Ozaki) +Li4ick +Lionel Elie Mamane +Liu DongMiao +Logan B +Logan Fleur +Lovius +Luc Ritchie +Luca Cherubin +Luca Steeb +Lucas +Lucas M +Lucas Moura +Lukas Anzinger +Lukas Fink +Lukáš Lalinský +Léo El Amri +M.K +M.Yasoob Khalid +MAA +MMM +MRWITEK +Magnus Kolstad +Malte Kiefer +Mamay Alexander +Mantas Mikulėnas +Manu Cornet +Mao Zedong +Marcin Cieślak +Marco Fantauzzo +Marco Ferragina +Marco Schuster +Marek Rusinowski +Marian Sigler +Mark Lee +Mark Oteiza +Mark Schreiber +Markus Müller +Martin Michlmayr +Martin Polden +Martin Ström +Martin Trigaux +Martin Weinelt +Marvin Ewald +Matej Dujava +Mathias Rav +Mats +Matt Broadway +Matt Crupi +Matthew Franglen +Matthew Rayermann +Matthew Rayfield +Matthieu Muffato +Mattias Harrysson +Mattias Wadman +Matěj Cepl +Max +Max Mehl +Max Teegen +MaxReimann +Mel Shafer +Meneth32 +Mevious +Michael Haggerty +Michael Kaiser +Michael Klein +Michael Käufl +Michael Munch +Michael Orlitzky +Michael Pauley +Michael Smith +Michael Tilbury +Michael Walter +Michal Kubeček +Michal Čihař +Mike Fährmann +MikeCol +MinePlayersPE +Miroslav Šedivý +Mister Hat +Mitsukarenai +MobiDotS +Mohamedh Fazal +Mohammad Khaled AbouElSherbini +Mohammad Teimori Pabandi +Mohammed Yaseen Mowzer +Moises Lima +Moritz Patelscheck +MrDoritos +MrRawes +Muratcan Simsek +N1k145 +NRTICN +Naglis Jonaitis +Namnamseo +Nathan Rossi +Nehal Patel +NeroBurner +Nevar Angelo +Nick Daniels +Nicolas Kaiser +Nicolas SAPA +Nicolas Évrard +Nii-90 +Niklas Haas +Niklas Laxström +Nikoli +Nil Admirari +NotFound +Odd Stråbø +OhMyBahGosh +Ole Ernst +Oleg Prutz +Oli Allen +Oliver Freyermuth +Olivier Bilodeau +Ondřej Bárta +Ondřej Caletka +Ori Avtalion +Orn +Osama Khalid +Oskar Cieslik +Oskar Jauch +P-reducible +PB +PC +PSJay +PSlava +Paper +Parmjit Virk +Pascal Brax +Patrice Levesque +Patrick Dessalle +Patrick Griffis +Paul Hartmann +Paul Henning +Paul Ivanov +Paul Wise +Paul Wrubel +Pawit Pornkitprasan +Pccode66 +Pete Hemery +Peter +Peter Oettig +Peter Pitzulo +Peter Rowlands +PeterDing +Petr Kutalek +Petr Novák +Petr Vaněk +Petr Zvoníček +Phil Kulak +Philip Huppert +Philip Xu +Philipp Hagemeister +Philipp Stehle +Phạm Ngọc Quang Nam +Pierre +Pierre Fenoll +Pierre Mdawar +Pierre Rudloff +PilzAdam +PishPosh.McGee +Pornophage +Poschi +Pratyush Singh +PrinceOfPuppers +Protuhj +Puck Meerburg +Purdea Andrei +Qijiang Fan +Quan Hua +Quentin Rameau +RPing +Rafal Borczuch +Ralf Haring +Random User +Raphael Michel +Rasmus Rendal +Rastislav Barlik +Ray Douglass +Remita Amine +Reto Kromer +Reventl0v +RexYuan +RiCON +Ricardo +Ricardo Constantino +Ricardo Garcia +Richard Clamp +Rob +Rob van Bekkum +Robert Smith +Robin +Robin Dunn +Robin Houtevelts +Robin Neatherway +Rogério Brito +Roland Hieber +Roman Beránek +Roman Le Négrate +Roman Sebastian Karwacik +RomanEmelyanov +Ronald Ip +Ronnnny +Roxedus +Ruirize +Ryan Hendrickson +Ryan Schmidt +Rémy Léone +Sahebjot singh +Saimadhav Heblikar +Sainyam Kapoor +Sam +Samik Some +Sander +Sander van den Oever +Santiago Calcagno +Scott Leggett +Seamus Phelan +Sebastian Blunt +Sebastian Haas +Sebastian Leske +Sematre +Sen Jiang +SeonjaeHyeon +Sergey +Sergey Alirzaev +Sergey M․ +Sergio Livi +Serkora +Shadab Zafar +Shai Coleman +Shaun Walbridge +Shaya G +Shrimadhav U K +Sidney de Koning +Silvan Mosberger +Simon Morgan +Simon W. 
Jackson +Singwai Chan +Sipherdrakon +SirCipherz +Slava Shklyaev +Soebb +Soneé John +Sonic +Stanislav Kupryakhin +Stanny Nuytkens +Starsam80 +Stavros Ntentos +Stefan Pöschel +Stefan-Gabriel Muscalu +Steffan Donal +Stephan +Stephen Stair +Steven Gosseling +Steven Maude +Sukhbir Singh +Surkal +Surya Oktafendri +SyxbEaEQ2 +TRox1972 +Tailszefox +Takuya Tsuchida +Tatsuyuki Ishi +Teemu Ikonen +TheRealDude2 +Thijs Vermeir +Thomas Christlieb +Thomas Jost +Thomas van der Berg +Thor77 +Throaway +Tianyi Shi +Till Maas +Tim +Tim Broder +Tim Douglas +Tim Landscheidt +Tim Schindler +Tim Sogard +Timendum +Timmy +TinyToweringTree +Tithen-Firion +Tjark Saul +Toan Nguyen +Tobias Bell +Tobias Florek +Tobias Gruetzmacher +Tobias Kunze +Tobias Salzmann +Todoroki +Tom +Tom Gijselinck +Tom-Oliver Heidel +Tomáš Čech +Toni Viemerö +TotalCaesar659 +Trevor Nelson +Tristan Waddington +Tyler Szabo +Unit 193 +Unknown +Urgau +Varun +Vasyl' Vavrychuk +Vid +VietTPham +Vignesh Venkat +Vijay Singh +Viktor Szakats +Viren Rajput +Vitaliy Syrchikov +Vobe +Vrihub +Vukkk +Vítor Galvão +Wandang +Wang Jun Tham +WassimAttar +Wes +Will Glynn +Will Sewell +Windom +Witchakorn Kamolpornwijit +Witold Baryluk +WolfganP +Xaver Hellauer +Xiao Di Guan +Xie Yanbo +Xu Cheng +Xuan Hu (Sean) +Yakabuff +Yasoob +Yen Chi Hsuan +Your Name +Yuan Chao +YuenSzeHong +Yurii H +Yuriy Melnyk +Zach Bruggeman +Zack Fernandes +Zenon Mousmoulas +Zhong Jianxin +Zirro +aarubui +aegamesi +aeph6Ee0 +aerworker +ajj8 +alarig +alimirjamali +alphapapa +alxnull +amigatomte +anatoly techtonik +andi +animelover1984 +anovicecodemonkey +arza +ashutosh-mishra +atomic83 +atomizer +aviperes +axelerometer +aystroganov@gmail.com +azeem +bastik +bato3 +beefchop +bitraid +biwubo +blissland +bonfy +bopol +bpfoley +bzc6p +cant-think-of-a-name +cantandwont +capital-G +catboy +catlover999 +cazulu +cclauss +cdarlint +chaos33 +chaoskagami +charon2019 +chien-yu +chio0hai +chocolateboy +chris +ckuu +cladmi +clauderains +cntrl-s +codelol +codesparkle +coletdev +coletdjnz +compujo +comsomisha +coolsa +coreynicholson +corone17 +cpm +cryptonaut +cryzed +cyberfox1691 +cypheron +d2au +dalan +dannyc@omega +dannycolligan +danut007ro +davex25 +denneboomyo +dequis +dimqua +dinesh +dirkf +dmsummers +dodo +dongmao zhang +dubber0 +dundua +dwemthy +dyn888 +ealgase +enigmaquip +epitron +ericpardee +exwm +f4pp3rk1ng +felix +fiocfun +flatgreen +fluks +fnord +foghawk +forDream +frenchy1983 +funniray +gam2046 +gcmalloc +gdzx +geauxlo +geditorit +git-anony-mouse +github-actions +gkoelln +grimreaper +gritstub +guredora +gustaf +h-collector +ha shao +hakatashi +hassaanaliw +hcwhan +hdclark +hedii +helb +hh0rva1h +hmlinaric +hojel +hrimfaxi +hseg +hub2git +huichen90 +huohuarong +hurda +i6t +ian +igv +inondle +insaneracist +ipaha +ischmidt20 +ispedals +iwconfig +j +j54vc1bk +jahudka +james +james mike dupont +jamiejones +jfogelman +jhwgh1968 +jjatria +jnozsc +joehillen +jomo +julien +jxu +k3ns1n +kaspi +kayb94 +kaz-us +kebianizao +kenavera +kennell +kidol +kikuyan +kinetoskombi +king-millez +kitty +kkalpakloglou +knagano +knapior +kr4ssi +krichbanana +kurumigi +lazypete365 +light94 +lightmare +linhua55 +lkho +llyyr +logon84 +lorpus +louie-github +luboss +luceatnobis +lyz-code +m0viefreak +mahanstreamer +main() +makeworld +marcwebbie +marieell +mars67857 +martin54 +mc2avr +mcd1992 +megustamucho +mehq +mexican porn commits +midas02 +migbac +minusf +mjdubell +mlindner +motophil +mpeter50 +mrBliss +mrkrossxdx +mrtnmtth +mtilbury +mutantmonkey +mzbaulhaque +nawl +nemunaire +net +netanel +neutric +newtonelectron 
+ngld +niebles +nikhil +nixxo +nmeum +nmrugg +nto +nulloz +nyorain +nyuszika7h +obeythepenguin@gmail.com +octotherp +ofkz +oittaa +opusforlife2 +oteng +ouwou +ovitei +ozburo +pachacamac +patrickslin +peugeot +pgaig +phaer +phan-ctrl +phi +phiresky +phlip +ping +pingtux +piplongrun +pishposhmcgee +plroman +pukkandan +pulpe +pyed +pypy +quinlander +quyleanh +raleeper +random-nick +rawcoder +reddraggone9 +reiv +remis +renalid +rhhayward +rhsmachine +rigstot +riking +rmanola +robbie +robin +rr- +rrooij +rubicks +runningbits +rupertbaxter2 +ruuk +rzhxeo +s0u1h +sahutd +satunnainen +sceext +schn0sch +schnusch +scil +sh!zeeg +shirt-dev +sian1468 +sichuan-pepper +siddharth +siikamiika +skacurt +slangangular +slocum +smed79 +snipem +sofutru +sourcerect +sprhawk +spvkgn +squibbysquibby +ssaqua +stanoarn +std-move +stephen +stepshal +steven7851 +striker.sh +supritkumar +sxvghd +t0mm0 +tandy1000 +teemuy +teesid +telephono +tempname +teridon +testbonn +tetra-eder +tewe +tfvlrue +thc202 +theGeekPirate +theychx +tiktok +timethrow +tinybug +tippfeler +tlonic +tlsssl +tom +toniz4 +trasssh +troywith77 +tsantala +tsia +u-spec-png +user +utlasidyo +v-delta +venth +vijayanand nandam +vobe +vordep +vvto33 +wankerer +willbeaufoy +winwon +wolfy1339 +xantares +xarantolus +xavier +xbe +xofe +xtkoba +xuhaomin +xypwn +xyssy +yac +yonaikerlol +z00nx 0 +zackmark29 +zcanfly +zejn +zenerdi0de +zootedb0t +zouhair +zraktvor +zubearc +zulaport +zurfyx +zx8 +Ákos Sülyi +虾哥哥 +谭九鼎 diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 048d988..8d62c04 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -2,6 +2,7 @@ pukkandan (owner) shirt-dev (collaborator) coletdjnz/colethedj (collaborator) Ashish0804 (collaborator) +nao20010128nao/Lesmiscore (collaborator) h-h-h-h pauldubois98 nixxo @@ -19,7 +20,6 @@ samiksome alxnull FelixFrog Zocker1999NET -nao20010128nao kurumigi bbepis animelover1984/horahoradev @@ -125,3 +125,92 @@ jfogelman timethrow sarnoud Bojidarist +18928172992817182/gustaf +nixklai +smplayer-dev +Zirro +CrypticSignal +flashdagger +fractalf +frafra +kaz-us +ozburo +rhendric +sdomi +selfisekai +stanoarn +0xA7404A/Aurora +4a1e2y5 +aarubui +chio0hai +cntrl-s +Deer-Spangle +DEvmIb +Grabien/MaximVol +j54vc1bk +mpeter50 +mrpapersonic +pabs3 +staubichsauger +xenova +Yakabuff +zulaport +ehoogeveen-medweb +PilzAdam +zmousm +iw0nderhow +unit193 +TwoThousandHedgehogs/KathrynElrod +Jertzukka +cypheron +Hyeeji +bwildenhain +C0D3D3V +kebianizao +Lapin0t +abdullah-if +DavidSkrundz +mkubecek +raleeper +YuenSzeHong +Sematre +jaller94 +r5d +julien-hadleyjack +git-anony-mouse +mdawar +trassshhub +foghawk +k3ns1n +teridon +mozlima +timendum +ischmidt20 +CreaValix +sian1468 +arkamar +hyano +KiberInfinity +tejing1 +Bricio +lazypete365 +Aniruddh-J +blackgear +CplPwnies +cyberfox1691 +FestplattenSchnitzel +hatienl0i261299 +iphoting +jakeogh +lukasfink1 +lyz-code +marieell +mdpauley +Mipsters +mxmehl +ofkz +P-reducible +pycabbage +regarten +Ronnnny +schn0sch diff --git a/MANIFEST.in b/MANIFEST.in index e43cb87..300ae69 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -5,5 +5,6 @@ include README.md include completions/*/* include supportedsites.md include hypervideo.1 +include requirements.txt recursive-include devscripts * recursive-include test * diff --git a/bin/hypervideo b/bin/hypervideo old mode 100755 new mode 100644 diff --git a/completions/zsh/_hypervideo b/completions/zsh/_hypervideo new file mode 100644 index 0000000..0a8d491 --- /dev/null +++ b/completions/zsh/_hypervideo @@ -0,0 +1,30 @@ +#compdef hypervideo + +__hypervideo_dl() { + local 
curcontext="$curcontext" fileopts diropts cur prev + typeset -A opt_args + fileopts="--download-archive|-a|--batch-file|--load-info-json|--load-info|--cookies|--no-cookies" + diropts="--cache-dir" + cur=$words[CURRENT] + case $cur in + :) + _arguments '*: :(::ytfavorites ::ytrecommended ::ytsubscriptions ::ytwatchlater ::ythistory)' + ;; + *) + prev=$words[CURRENT-1] + if [[ ${prev} =~ ${fileopts} ]]; then + _path_files + elif [[ ${prev} =~ ${diropts} ]]; then + _path_files -/ + elif [[ ${prev} == "--remux-video" ]]; then + _arguments '*: :(mp4 mkv)' + elif [[ ${prev} == "--recode-video" ]]; then + _arguments '*: :(mp4 flv ogg webm mkv)' + else + _arguments '*: :(--help --version --ignore-errors --no-abort-on-error --abort-on-error --dump-user-agent --list-extractors --extractor-descriptions --force-generic-extractor --default-search --ignore-config --no-config-locations --config-locations --flat-playlist --no-flat-playlist --live-from-start --no-live-from-start --wait-for-video --no-wait-for-video --mark-watched --no-mark-watched --no-colors --compat-options --proxy --socket-timeout --source-address --force-ipv4 --force-ipv6 --geo-verification-proxy --cn-verification-proxy --geo-bypass --no-geo-bypass --geo-bypass-country --geo-bypass-ip-block --playlist-start --playlist-end --playlist-items --match-title --reject-title --min-filesize --max-filesize --date --datebefore --dateafter --min-views --max-views --match-filters --no-match-filter --no-playlist --yes-playlist --age-limit --download-archive --no-download-archive --max-downloads --break-on-existing --break-on-reject --break-per-input --no-break-per-input --skip-playlist-after-errors --include-ads --no-include-ads --concurrent-fragments --limit-rate --throttled-rate --retries --file-access-retries --fragment-retries --skip-unavailable-fragments --abort-on-unavailable-fragment --keep-fragments --no-keep-fragments --buffer-size --resize-buffer --no-resize-buffer --http-chunk-size --test --playlist-reverse --no-playlist-reverse --playlist-random --xattr-set-filesize --hls-prefer-native --hls-prefer-ffmpeg --hls-use-mpegts --no-hls-use-mpegts --downloader --downloader-args --batch-file --no-batch-file --id --paths --output --output-na-placeholder --autonumber-size --autonumber-start --restrict-filenames --no-restrict-filenames --windows-filenames --no-windows-filenames --trim-filenames --no-overwrites --force-overwrites --no-force-overwrites --continue --no-continue --part --no-part --mtime --no-mtime --write-description --no-write-description --write-info-json --no-write-info-json --write-annotations --no-write-annotations --write-playlist-metafiles --no-write-playlist-metafiles --clean-info-json --no-clean-info-json --write-comments --no-write-comments --load-info-json --cookies --no-cookies --cookies-from-browser --no-cookies-from-browser --cache-dir --no-cache-dir --rm-cache-dir --write-thumbnail --no-write-thumbnail --write-all-thumbnails --list-thumbnails --write-link --write-url-link --write-webloc-link --write-desktop-link --quiet --no-warnings --simulate --no-simulate --ignore-no-formats-error --no-ignore-no-formats-error --skip-download --print --print-to-file --get-url --get-title --get-id --get-thumbnail --get-description --get-duration --get-filename --get-format --dump-json --dump-single-json --print-json --force-write-archive --newline --no-progress --progress --console-title --progress-template --verbose --dump-pages --write-pages --youtube-print-sig-code --print-traffic --call-home --no-call-home --encoding 
--legacy-server-connect --no-check-certificates --prefer-insecure --user-agent --referer --add-header --bidi-workaround --sleep-requests --sleep-interval --max-sleep-interval --sleep-subtitles --format --format-sort --format-sort-force --no-format-sort-force --video-multistreams --no-video-multistreams --audio-multistreams --no-audio-multistreams --all-formats --prefer-free-formats --no-prefer-free-formats --check-formats --check-all-formats --no-check-formats --list-formats --list-formats-as-table --list-formats-old --merge-output-format --allow-unplayable-formats --no-allow-unplayable-formats --write-subs --no-write-subs --write-auto-subs --no-write-auto-subs --all-subs --list-subs --sub-format --sub-langs --username --password --twofactor --netrc --netrc-location --video-password --ap-mso --ap-username --ap-password --ap-list-mso --extract-audio --audio-format --audio-quality --remux-video --recode-video --postprocessor-args --keep-video --no-keep-video --post-overwrites --no-post-overwrites --embed-subs --no-embed-subs --embed-thumbnail --no-embed-thumbnail --embed-metadata --no-embed-metadata --embed-chapters --no-embed-chapters --embed-info-json --no-embed-info-json --metadata-from-title --parse-metadata --replace-in-metadata --xattrs --concat-playlist --fixup --prefer-avconv --prefer-ffmpeg --ffmpeg-location --exec --no-exec --exec-before-download --no-exec-before-download --convert-subs --convert-thumbnails --split-chapters --no-split-chapters --remove-chapters --no-remove-chapters --force-keyframes-at-cuts --no-force-keyframes-at-cuts --use-postprocessor --sponsorblock-mark --sponsorblock-remove --sponsorblock-chapter-title --no-sponsorblock --sponsorblock-api --sponskrub --no-sponskrub --sponskrub-cut --no-sponskrub-cut --sponskrub-force --no-sponskrub-force --sponskrub-location --sponskrub-args --extractor-retries --allow-dynamic-mpd --ignore-dynamic-mpd --hls-split-discontinuity --no-hls-split-discontinuity --extractor-args --youtube-include-dash-manifest --youtube-skip-dash-manifest --youtube-include-hls-manifest --youtube-skip-hls-manifest)' + fi + ;; + esac +} + +__hypervideo_dl \ No newline at end of file diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index 7a38e40..1e22620 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -9,7 +9,7 @@ import sys sys.path.insert(0, dirn(dirn((os.path.abspath(__file__))))) -lazy_extractors_filename = sys.argv[1] +lazy_extractors_filename = sys.argv[1] if len(sys.argv) > 1 else 'hypervideo_dl/extractor/lazy_extractors.py' if os.path.exists(lazy_extractors_filename): os.remove(lazy_extractors_filename) @@ -39,12 +39,6 @@ class {name}({bases}): _module = '{module}' ''' -make_valid_template = ''' - @classmethod - def _make_valid_url(cls): - return {valid_url!r} -''' - def get_base_name(base): if base is InfoExtractor: @@ -61,15 +55,14 @@ def build_lazy_ie(ie, name): bases=', '.join(map(get_base_name, ie.__bases__)), module=ie.__module__) valid_url = getattr(ie, '_VALID_URL', None) + if not valid_url and hasattr(ie, '_make_valid_url'): + valid_url = ie._make_valid_url() if valid_url: s += f' _VALID_URL = {valid_url!r}\n' if not ie._WORKING: s += ' _WORKING = False\n' if ie.suitable.__func__ is not InfoExtractor.suitable.__func__: s += f'\n{getsource(ie.suitable)}' - if hasattr(ie, '_make_valid_url'): - # search extractors - s += make_valid_template.format(valid_url=ie._make_valid_url()) return s diff --git a/devscripts/make_supportedsites.py 
b/devscripts/make_supportedsites.py index a079406..9bce04b 100644 --- a/devscripts/make_supportedsites.py +++ b/devscripts/make_supportedsites.py @@ -24,11 +24,13 @@ def main(): def gen_ies_md(ies): for ie in ies: ie_md = '**{0}**'.format(ie.IE_NAME) - ie_desc = getattr(ie, 'IE_DESC', None) - if ie_desc is False: + if ie.IE_DESC is False: continue - if ie_desc is not None: + if ie.IE_DESC is not None: ie_md += ': {0}'.format(ie.IE_DESC) + search_key = getattr(ie, 'SEARCH_KEY', None) + if search_key is not None: + ie_md += f'; "{ie.SEARCH_KEY}:" prefix' if not ie.working(): ie_md += ' (Currently broken)' yield ie_md diff --git a/devscripts/prepare_manpage.py b/devscripts/prepare_manpage.py index 58090d4..8920df1 100644 --- a/devscripts/prepare_manpage.py +++ b/devscripts/prepare_manpage.py @@ -13,12 +13,14 @@ PREFIX = r'''%HYPERVIDEO(1) # NAME -youtube\-dl \- download videos from youtube.com or other video platforms +yt\-dlp \- A youtube-dl fork with additional features and patches # SYNOPSIS **hypervideo** \[OPTIONS\] URL [URL...] +# DESCRIPTION + ''' @@ -33,47 +35,63 @@ def main(): with io.open(README_FILE, encoding='utf-8') as f: readme = f.read() - readme = re.sub(r'(?s)^.*?(?=# DESCRIPTION)', '', readme) - readme = re.sub(r'\s+hypervideo \[OPTIONS\] URL \[URL\.\.\.\]', '', readme) - readme = PREFIX + readme - + readme = filter_excluded_sections(readme) + readme = move_sections(readme) readme = filter_options(readme) with io.open(outfile, 'w', encoding='utf-8') as outf: - outf.write(readme) + outf.write(PREFIX + readme) + + +def filter_excluded_sections(readme): + EXCLUDED_SECTION_BEGIN_STRING = re.escape('<!-- MANPAGE: BEGIN EXCLUDED SECTION -->') + EXCLUDED_SECTION_END_STRING = re.escape('<!-- MANPAGE: END EXCLUDED SECTION -->') + return re.sub( + rf'(?s){EXCLUDED_SECTION_BEGIN_STRING}.+?{EXCLUDED_SECTION_END_STRING}\n', + '', readme) + + +def move_sections(readme): + MOVE_TAG_TEMPLATE = '<!-- MANPAGE: MOVE "%s" SECTION HERE -->' + sections = re.findall(r'(?m)^%s$' % ( + re.escape(MOVE_TAG_TEMPLATE).replace(r'\%', '%') % '(.+)'), readme) + + for section_name in sections: + move_tag = MOVE_TAG_TEMPLATE % section_name + if readme.count(move_tag) > 1: + raise Exception(f'There is more than one occurrence of "{move_tag}". This is unexpected') + + sections = re.findall(rf'(?sm)(^# {re.escape(section_name)}.+?)(?=^# )', readme) + if len(sections) < 1: + raise Exception(f'The section {section_name} does not exist') + elif len(sections) > 1: + raise Exception(f'There are multiple occurrences of section {section_name}, this is unhandled') + + readme = readme.replace(sections[0], '', 1).replace(move_tag, sections[0], 1) + return readme def filter_options(readme): - ret = '' - in_options = False - for line in readme.split('\n'): - if line.startswith('# '): - if line[2:].startswith('OPTIONS'): - in_options = True - else: - in_options = False - - if in_options: - if line.lstrip().startswith('-'): - split = re.split(r'\s{2,}', line.lstrip()) - # Description string may start with `-` as well. If there is - # only one piece then it's a description bit not an option. - if len(split) > 1: - option, description = split - split_option = option.split(' ') - - if not split_option[-1].startswith('-'): # metavar - option = ' '.join(split_option[:-1] + ['*%s*' % split_option[-1]]) - - # Pandoc's definition_lists. See http://pandoc.org/README.html - # for more information. 
- ret += '\n%s\n: %s\n' % (option, description) - continue - ret += line.lstrip() + '\n' - else: - ret += line + '\n' - - return ret + section = re.search(r'(?sm)^# USAGE AND OPTIONS\n.+?(?=^# )', readme).group(0) + options = '# OPTIONS\n' + for line in section.split('\n')[1:]: + mobj = re.fullmatch(r'''(?x) + \s{4}(?P<opt>-(?:,\s|[^\s])+) + (?:\s(?P<meta>(?:[^\s]|\s(?!\s))+))? + (\s{2,}(?P<desc>.+))? + ''', line) + if not mobj: + options += f'{line.lstrip()}\n' + continue + option, metavar, description = mobj.group('opt', 'meta', 'desc') + + # Pandoc's definition_lists. See http://pandoc.org/README.html + option = f'{option} *{metavar}*' if metavar else option + description = f'{description}\n' if description else '' + options += f'\n{option}\n: {description}' + continue + + return readme.replace(section, options, 1) if __name__ == '__main__': diff --git a/hypervideo_dl/YoutubeDL.py b/hypervideo_dl/YoutubeDL.py old mode 100755 new mode 100644 index 5b5a0d7..276f42d --- a/hypervideo_dl/YoutubeDL.py +++ b/hypervideo_dl/YoutubeDL.py @@ -5,7 +5,6 @@ from __future__ import absolute_import, unicode_literals import collections import contextlib -import copy import datetime import errno import fileinput @@ -28,10 +27,12 @@ import traceback import random import unicodedata +from enum import Enum from string import ascii_letters from .compat import ( compat_basestring, + compat_brotli, compat_get_terminal_size, compat_kwargs, compat_numeric_types, @@ -55,9 +56,7 @@ from .utils import ( DEFAULT_OUTTMPL, determine_ext, determine_protocol, - DOT_DESKTOP_LINK_TEMPLATE, - DOT_URL_LINK_TEMPLATE, - DOT_WEBLOC_LINK_TEMPLATE, + DownloadCancelled, DownloadError, encode_compat_str, encodeFilename, @@ -66,33 +65,46 @@ from .utils import ( ExistingVideoReached, expand_path, ExtractorError, + filter_dict, float_or_none, format_bytes, format_field, + format_decimal_suffix, formatSeconds, GeoRestrictedError, + get_domain, + has_certifi, HEADRequest, + InAdvancePagedList, int_or_none, iri_to_uri, ISO3166Utils, + join_nonempty, LazyList, + LINK_TEMPLATES, locked_file, make_dir, make_HTTPS_handler, MaxDownloadsReached, + merge_headers, network_exceptions, + NO_DEFAULT, + number_of_digits, orderedSet, OUTTMPL_TYPES, PagedList, parse_filesize, PerRequestProxyHandler, platform_name, + Popen, + POSTPROCESS_WHEN, PostProcessingError, preferredencoding, prepend_extension, - process_communicate_or_kill, + ReExtractInfo, register_socks_protocols, RejectedVideoReached, + remove_terminal_sequences, render_table, replace_extension, SameFileError, @@ -107,8 +119,7 @@ from .utils import ( strftime_or_none, subtitles_filename, supports_terminal_sequences, - TERMINAL_SEQUENCES, - ThrottledDownload, + timetuple_from_msec, to_high_limit_path, traverse_obj, try_get, @@ -123,6 +134,7 @@ from .utils import ( YoutubeDLRedirectHandler, ) from .cache import Cache +from .minicurses import format_text from .extractor import ( gen_extractor_classes, get_info_extractor, @@ -139,6 +151,7 @@ from .downloader.rtmp import rtmpdump_version from .postprocessor import ( get_postprocessor, EmbedThumbnailPP, + FFmpegFixupDuplicateMoovPP, FFmpegFixupDurationPP, FFmpegFixupM3u8PP, FFmpegFixupM4aPP, @@ -192,7 +205,12 @@ class YoutubeDL(object): verbose: Print additional info to stdout. quiet: Do not print messages to stdout. no_warnings: Do not print out anything for warnings. - forceprint: A list of templates to force print + forceprint: A dict with keys WHEN mapped to a list of templates to + print to stdout. 
The allowed keys are video or any of the + items in utils.POSTPROCESS_WHEN. For compatibility, a single list is also accepted + print_to_file: A dict with keys WHEN (same as forceprint) mapped to + a list of tuples with (template, filename) forceurl: Force printing final URL. (Deprecated) forcetitle: Force printing title. (Deprecated) forceid: Force printing ID. (Deprecated) @@ -208,20 +226,26 @@ class YoutubeDL(object): simulate: Do not download the video files. If unset (or None), simulate only if listsubtitles, listformats or list_thumbnails is used format: Video format code. see "FORMAT SELECTION" for more details. + You can also pass a function. The function takes 'ctx' as + argument and returns the formats to download. + See "build_format_selector" for an implementation allow_unplayable_formats: Allow unplayable formats to be extracted and downloaded. ignore_no_formats_error: Ignore "No video formats" error. Usefull for extracting metadata even if the video is not actually available for download (experimental) - format_sort: How to sort the video formats. see "Sorting Formats" - for more details. + format_sort: A list of fields by which to sort the video formats. + See "Sorting Formats" for more details. format_sort_force: Force the given format_sort. see "Sorting Formats" for more details. + prefer_free_formats: Whether to prefer video formats with free containers + over non-free ones of same quality. allow_multiple_video_streams: Allow multiple video streams to be merged into a single file allow_multiple_audio_streams: Allow multiple audio streams to be merged into a single file check_formats Whether to test if the formats are downloadable. - Can be True (check all), False (check none) + Can be True (check all), False (check none), + 'selected' (check selected formats), or None (check only if requested by extractor) paths: Dictionary of output paths. The allowed keys are 'home' 'temp' and the keys of OUTTMPL_TYPES (in utils.py) @@ -303,13 +327,18 @@ class YoutubeDL(object): file that is in the archive. break_on_reject: Stop the download process when encountering a video that has been filtered out. + break_per_url: Whether break_on_reject and break_on_existing + should act on each input URL as opposed to for the entire queue cookiefile: File name where cookies should be read from and dumped to - cookiesfrombrowser: A tuple containing the name of the browser and the profile - name/path from where cookies are loaded. - Eg: ('chrome', ) or (vivaldi, 'default') - nocheckcertificate:Do not verify SSL certificates + cookiesfrombrowser: A tuple containing the name of the browser, the profile + name/path from where cookies are loaded, and the name of the + keyring. Eg: ('chrome', ) or ('vivaldi', 'default', 'BASICTEXT') + legacyserverconnect: Explicitly allow HTTPS connection to servers that do not + support RFC 5746 secure renegotiation + nocheckcertificate: Do not verify SSL certificates prefer_insecure: Use HTTP instead of HTTPS to retrieve information. At the moment, this is only supported by YouTube. + http_headers: A dictionary of custom headers to be used for all requests proxy: URL of the proxy server to use geo_verification_proxy: URL of the proxy to use for IP address verification on geo-restricted sites. 
@@ -317,18 +346,21 @@ class YoutubeDL(object): bidi_workaround: Work around buggy terminals without bidirectional text support, using fridibi debug_printtraffic:Print out sent and received HTTP traffic - include_ads: Download ads as well + include_ads: Download ads as well (deprecated) default_search: Prepend this string if an input url is not valid. 'auto' for elaborate guessing encoding: Use this encoding instead of the system-specified. extract_flat: Do not resolve URLs, return the immediate result. Pass in 'in_playlist' to only show this behavior for playlist items. + wait_for_video: If given, wait for scheduled streams to become available. + The value should be a tuple containing the range + (min_secs, max_secs) to wait between retries postprocessors: A list of dictionaries, each with an entry * key: The name of the postprocessor. See hypervideo_dl/postprocessor/__init__.py for a list. - * when: When to run the postprocessor. Can be one of - pre_process|before_dl|post_process|after_move. + * when: When to run the postprocessor. Allowed values are + the entries of utils.POSTPROCESS_WHEN Assumed to be 'post_process' if not given post_hooks: Deprecated - Register a custom postprocessor instead A list of functions that get called as the final step @@ -370,8 +402,7 @@ class YoutubeDL(object): (with status "started" and "finished") if the processing is successful. merge_output_format: Extension to use when merging formats. final_ext: Expected final extension; used to detect when the file was - already downloaded and converted. "merge_output_format" is - replaced by this extension when given + already downloaded and converted fixup: Automatically correct known faults of the file. One of: - "never": do nothing @@ -425,7 +456,7 @@ class YoutubeDL(object): compat_opts: Compatibility options. See "Differences in default behavior". The following options do not work when used through the API: filename, abort-on-error, multistreams, no-live-chat, format-sort - no-clean-infojson, no-playlist-metafiles, no-keep-subs. + no-clean-infojson, no-playlist-metafiles, no-keep-subs, no-attach-info-json. Refer __init__.py for their implementation progress_template: Dictionary of templates for progress outputs. Allowed keys are 'download', 'postprocess', @@ -435,9 +466,9 @@ class YoutubeDL(object): The following parameters are not used by YoutubeDL itself, they are used by the downloader (see hypervideo_dl/downloader/common.py): nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize, - max_filesize, test, noresizebuffer, retries, fragment_retries, continuedl, - noprogress, xattr_set_filesize, hls_use_mpegts, http_chunk_size, - external_downloader_args. + max_filesize, test, noresizebuffer, retries, file_access_retries, fragment_retries, + continuedl, noprogress, xattr_set_filesize, hls_use_mpegts, http_chunk_size, + external_downloader_args, concurrent_fragment_downloads. The following options are used by the post processors: prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available, @@ -460,6 +491,7 @@ class YoutubeDL(object): extractor_args: A dictionary of arguments to be passed to the extractors. See "EXTRACTOR ARGUMENTS" for details. Eg: {'youtube': {'skip': ['dash', 'hls']}} + mark_watched: Mark videos watched (even with --simulate). Only for YouTube youtube_include_dash_manifest: Deprecated - Use extractor_args instead. If True (default), DASH manifests and related data will be downloaded and processed by extractor. 
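The wait_for_video tuple and the postprocessor 'when' key documented in the preceding hunk combine as in this minimal sketch (the wait range is arbitrary; FFmpegMetadata is one of the stock postprocessor keys, i.e. the class name without the PP suffix):

    ydl_opts = {
        # wait 60-300 seconds between retries for a scheduled stream
        'wait_for_video': (60, 300),
        'postprocessors': [{
            'key': 'FFmpegMetadata',   # postprocessor name
            'when': 'post_process',    # any entry of utils.POSTPROCESS_WHEN
        }],
    }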
@@ -482,33 +514,33 @@ class YoutubeDL(object): 'track_number', 'disc_number', 'release_year', )) + _format_fields = { + # NB: Keep in sync with the docstring of extractor/common.py + 'url', 'manifest_url', 'manifest_stream_number', 'ext', 'format', 'format_id', 'format_note', + 'width', 'height', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr', + 'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', + 'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start', + 'preference', 'language', 'language_preference', 'quality', 'source_preference', + 'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'downloader_options', + 'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time' + } _format_selection_exts = { 'audio': {'m4a', 'mp3', 'ogg', 'aac'}, 'video': {'mp4', 'flv', 'webm', '3gp'}, 'storyboards': {'mhtml'}, } - params = None - _ies = {} - _pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []} - _printed_messages = set() - _first_webpage_request = True - _download_retcode = None - _num_downloads = None - _playlist_level = 0 - _playlist_urls = set() - _screen_file = None - def __init__(self, params=None, auto_init=True): """Create a FileDownloader object with the given options. @param auto_init Whether to load the default extractors and print header (if verbose). - Set to 'no_verbose_header' to not ptint the header + Set to 'no_verbose_header' to not print the header """ if params is None: params = {} + self.params = params self._ies = {} self._ies_instances = {} - self._pps = {'pre_process': [], 'before_dl': [], 'after_move': [], 'post_process': []} + self._pps = {k: [] for k in POSTPROCESS_WHEN} self._printed_messages = set() self._first_webpage_request = True self._post_hooks = [] @@ -516,14 +548,23 @@ class YoutubeDL(object): self._postprocessor_hooks = [] self._download_retcode = 0 self._num_downloads = 0 - self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] - self._err_file = sys.stderr - self.params = params + self._num_videos = 0 + self._playlist_level = 0 + self._playlist_urls = set() self.cache = Cache(self) windows_enable_vt_mode() - # FIXME: This will break if we ever print color to stdout - self.params['no_color'] = self.params.get('no_color') or not supports_terminal_sequences(self._err_file) + self._out_files = { + 'error': sys.stderr, + 'print': sys.stderr if self.params.get('logtostderr') else sys.stdout, + 'console': None if compat_os_name == 'nt' else next( + filter(supports_terminal_sequences, (sys.stderr, sys.stdout)), None) + } + self._out_files['screen'] = sys.stderr if self.params.get('quiet') else self._out_files['print'] + self._allow_colors = { + type_: not self.params.get('no_color') and supports_terminal_sequences(self._out_files[type_]) + for type_ in ('screen', 'error') + } if sys.version_info < (3, 6): self.report_warning( @@ -531,10 +572,10 @@ class YoutubeDL(object): if self.params.get('allow_unplayable_formats'): self.report_warning( - f'You have asked for {self._color_text("unplayable formats", "blue")} to be listed/downloaded. ' + f'You have asked for {self._format_err("UNPLAYABLE", self.Styles.EMPHASIS)} formats to be listed/downloaded. ' 'This is a developer option intended for debugging. 
\n' ' If you experience any issues while using this option, ' - f'{self._color_text("DO NOT", "red")} open a bug report') + f'{self._format_err("DO NOT", self.Styles.ERROR)} open a bug report') def check_deprecated(param, option, suggestion): if self.params.get(param) is not None: @@ -550,8 +591,13 @@ class YoutubeDL(object): check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"') check_deprecated('useid', '--id', '-o "%(id)s.%(ext)s"') - for msg in self.params.get('warnings', []): + for msg in self.params.get('_warnings', []): self.report_warning(msg) + for msg in self.params.get('_deprecation_warnings', []): + self.deprecation_warning(msg) + + if 'list-formats' in self.params.get('compat_opts', []): + self.params['listformats_table'] = False if 'overwrites' not in self.params and self.params.get('nooverwrites') is not None: # nooverwrites was unnecessarily changed to overwrites @@ -563,7 +609,14 @@ class YoutubeDL(object): else: self.params['nooverwrites'] = not self.params['overwrites'] - if params.get('bidi_workaround', False): + self.params.setdefault('forceprint', {}) + self.params.setdefault('print_to_file', {}) + + # Compatibility with older syntax + if not isinstance(params['forceprint'], dict): + self.params['forceprint'] = {'video': params['forceprint']} + + if self.params.get('bidi_workaround', False): try: import pty master, slave = pty.openpty() @@ -575,24 +628,23 @@ class YoutubeDL(object): sp_kwargs = dict( stdin=subprocess.PIPE, stdout=slave, - stderr=self._err_file) + stderr=self._out_files['error']) try: - self._output_process = subprocess.Popen( - ['bidiv'] + width_args, **sp_kwargs - ) + self._output_process = Popen(['bidiv'] + width_args, **sp_kwargs) except OSError: - self._output_process = subprocess.Popen( - ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs) + self._output_process = Popen(['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs) self._output_channel = os.fdopen(master, 'rb') except OSError as ose: if ose.errno == errno.ENOENT: - self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that fribidi is an executable file in one of the directories in your $PATH.') + self.report_warning( + 'Could not find fribidi executable, ignoring --bidi-workaround. 
' + 'Make sure that fribidi is an executable file in one of the directories in your $PATH.') else: raise if (sys.platform != 'win32' and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] - and not params.get('restrictfilenames', False)): + and not self.params.get('restrictfilenames', False)): # Unicode filesystem API will throw errors (#1474, #13027) self.report_warning( 'Assuming --restrict-filenames since file system encoding ' @@ -604,9 +656,13 @@ class YoutubeDL(object): # Creating format selector here allows us to catch syntax errors before the extraction self.format_selector = ( - None if self.params.get('format') is None + self.params.get('format') if self.params.get('format') in (None, '-') + else self.params['format'] if callable(self.params['format']) else self.build_format_selector(self.params['format'])) + # Set http_headers defaults according to std_headers + self.params['http_headers'] = merge_headers(std_headers, self.params.get('http_headers', {})) + self._setup_opener() if auto_init: @@ -614,18 +670,21 @@ class YoutubeDL(object): self.print_debug_header() self.add_default_info_extractors() + hooks = { + 'post_hooks': self.add_post_hook, + 'progress_hooks': self.add_progress_hook, + 'postprocessor_hooks': self.add_postprocessor_hook, + } + for opt, fn in hooks.items(): + for ph in self.params.get(opt, []): + fn(ph) + for pp_def_raw in self.params.get('postprocessors', []): pp_def = dict(pp_def_raw) when = pp_def.pop('when', 'post_process') - pp_class = get_postprocessor(pp_def.pop('key')) - pp = pp_class(self, **compat_kwargs(pp_def)) - self.add_post_processor(pp, when=when) - - for ph in self.params.get('post_hooks', []): - self.add_post_hook(ph) - - for ph in self.params.get('progress_hooks', []): - self.add_progress_hook(ph) + self.add_post_processor( + get_postprocessor(pp_def.pop('key'))(self, **compat_kwargs(pp_def)), + when=when) register_socks_protocols() @@ -633,7 +692,7 @@ class YoutubeDL(object): """Preload the archive, if any is specified""" if fn is None: return False - self.write_debug('Loading archive file %r\n' % fn) + self.write_debug(f'Loading archive file {fn!r}') try: with locked_file(fn, 'r', encoding='utf-8') as archive_file: for line in archive_file: @@ -660,7 +719,7 @@ class YoutubeDL(object): ) self.report_warning( 'Long argument string detected. ' - 'Use -- to separate parameters and URLs, like this:\n%s\n' % + 'Use -- to separate parameters and URLs, like this:\n%s' % args_to_str(correct_argv)) def add_info_extractor(self, ie): @@ -713,6 +772,9 @@ class YoutubeDL(object): def add_postprocessor_hook(self, ph): """Add the postprocessing progress hook""" self._postprocessor_hooks.append(ph) + for pps in self._pps.values(): + for pp in pps: + pp.add_progress_hook(ph) def _bidi_workaround(self, message): if not hasattr(self, '_output_channel'): @@ -734,14 +796,24 @@ class YoutubeDL(object): self._printed_messages.add(message) write_string(message, out=out, encoding=self.params.get('encoding')) - def to_stdout(self, message, skip_eol=False, quiet=False): + def to_stdout(self, message, skip_eol=False, quiet=None): """Print message to stdout""" + if quiet is not None: + self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument quiet. 
Use "YoutubeDL.to_screen" instead') + self._write_string( + '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')), + self._out_files['print']) + + def to_screen(self, message, skip_eol=False, quiet=None): + """Print message to screen if not in quiet mode""" if self.params.get('logger'): self.params['logger'].debug(message) - elif not quiet or self.params.get('verbose'): - self._write_string( - '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')), - self._err_file if quiet else self._screen_file) + return + if (self.params.get('quiet') if quiet is None else quiet) and not self.params.get('verbose'): + return + self._write_string( + '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')), + self._out_files['screen']) def to_stderr(self, message, only_once=False): """Print message to stderr""" @@ -749,36 +821,34 @@ class YoutubeDL(object): if self.params.get('logger'): self.params['logger'].error(message) else: - self._write_string('%s\n' % self._bidi_workaround(message), self._err_file, only_once=only_once) + self._write_string('%s\n' % self._bidi_workaround(message), self._out_files['error'], only_once=only_once) + + def _send_console_code(self, code): + if compat_os_name == 'nt' or not self._out_files['console']: + return + self._write_string(code, self._out_files['console']) def to_console_title(self, message): if not self.params.get('consoletitle', False): return + message = remove_terminal_sequences(message) if compat_os_name == 'nt': if ctypes.windll.kernel32.GetConsoleWindow(): # c_wchar_p() might not be necessary if `message` is # already of type unicode() ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) - elif 'TERM' in os.environ: - self._write_string('\033]0;%s\007' % message, self._screen_file) + else: + self._send_console_code(f'\033]0;{message}\007') def save_console_title(self): - if not self.params.get('consoletitle', False): - return - if self.params.get('simulate'): + if not self.params.get('consoletitle') or self.params.get('simulate'): return - if compat_os_name != 'nt' and 'TERM' in os.environ: - # Save the title on stack - self._write_string('\033[22;0t', self._screen_file) + self._send_console_code('\033[22;0t') # Save the title on stack def restore_console_title(self): - if not self.params.get('consoletitle', False): - return - if self.params.get('simulate'): + if not self.params.get('consoletitle') or self.params.get('simulate'): return - if compat_os_name != 'nt' and 'TERM' in os.environ: - # Restore the title from stack - self._write_string('\033[23;0t', self._screen_file) + self._send_console_code('\033[23;0t') # Restore the title from stack def __enter__(self): self.save_console_title() @@ -790,14 +860,15 @@ class YoutubeDL(object): if self.params.get('cookiefile') is not None: self.cookiejar.save(ignore_discard=True, ignore_expires=True) - def trouble(self, message=None, tb=None): + def trouble(self, message=None, tb=None, is_error=True): """Determine action to take when a download problem appears. Depending on if the downloader has been configured to ignore download errors or not, this method may throw an exception or not when errors are found, after printing the message. - tb, if given, is additional traceback information. 
+ @param tb If given, is additional traceback information + @param is_error Whether to raise error according to ignorerrors """ if message is not None: self.to_stderr(message) @@ -813,6 +884,8 @@ class YoutubeDL(object): tb = ''.join(tb_data) if tb: self.to_stderr(tb) + if not is_error: + return if not self.params.get('ignoreerrors'): if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]: exc_info = sys.exc_info()[1].exc_info @@ -821,15 +894,34 @@ class YoutubeDL(object): raise DownloadError(message, exc_info) self._download_retcode = 1 - def to_screen(self, message, skip_eol=False): - """Print message to stdout if not in quiet mode""" - self.to_stdout( - message, skip_eol, quiet=self.params.get('quiet', False)) - - def _color_text(self, text, color): - if self.params.get('no_color'): - return text - return f'{TERMINAL_SEQUENCES[color.upper()]}{text}{TERMINAL_SEQUENCES["RESET_STYLE"]}' + class Styles(Enum): + HEADERS = 'yellow' + EMPHASIS = 'light blue' + ID = 'green' + DELIM = 'blue' + ERROR = 'red' + WARNING = 'yellow' + SUPPRESS = 'light black' + + def _format_text(self, handle, allow_colors, text, f, fallback=None, *, test_encoding=False): + if test_encoding: + original_text = text + # handle.encoding can be None. See https://github.com/hypervideo/hypervideo/issues/2711 + encoding = self.params.get('encoding') or getattr(handle, 'encoding', None) or 'ascii' + text = text.encode(encoding, 'ignore').decode(encoding) + if fallback is not None and text != original_text: + text = fallback + if isinstance(f, self.Styles): + f = f.value + return format_text(text, f) if allow_colors else text if fallback is None else fallback + + def _format_screen(self, *args, **kwargs): + return self._format_text( + self._out_files['screen'], self._allow_colors['screen'], *args, **kwargs) + + def _format_err(self, *args, **kwargs): + return self._format_text( + self._out_files['error'], self._allow_colors['error'], *args, **kwargs) def report_warning(self, message, only_once=False): ''' @@ -841,14 +933,20 @@ class YoutubeDL(object): else: if self.params.get('no_warnings'): return - self.to_stderr(f'{self._color_text("WARNING:", "yellow")} {message}', only_once) + self.to_stderr(f'{self._format_err("WARNING:", self.Styles.WARNING)} {message}', only_once) + + def deprecation_warning(self, message): + if self.params.get('logger') is not None: + self.params['logger'].warning(f'DeprecationWarning: {message}') + else: + self.to_stderr(f'{self._format_err("DeprecationWarning:", self.Styles.ERROR)} {message}', True) - def report_error(self, message, tb=None): + def report_error(self, message, *args, **kwargs): ''' Do the same as trouble, but prefixes the message with 'ERROR:', colored in red if stderr is a tty file. ''' - self.trouble(f'{self._color_text("ERROR:", "red")} {message}', tb) + self.trouble(f'{self._format_err("ERROR:", self.Styles.ERROR)} {message}', *args, **kwargs) def write_debug(self, message, only_once=False): '''Log debug message or Print message to stderr''' @@ -874,13 +972,13 @@ class YoutubeDL(object): except UnicodeEncodeError: self.to_screen('Deleting existing file') - def raise_no_formats(self, info, forced=False): + def raise_no_formats(self, info, forced=False, *, msg=None): has_drm = info.get('__has_drm') - msg = 'This video is DRM protected' if has_drm else 'No video formats found!' 
-        expected = self.params.get('ignore_no_formats_error')
-        if forced or not expected:
+        ignored, expected = self.params.get('ignore_no_formats_error'), bool(msg)
+        msg = msg or has_drm and 'This video is DRM protected' or 'No video formats found!'
+        if forced or not ignored:
             raise ExtractorError(msg, video_id=info['id'], ie=info['extractor'],
-                                 expected=has_drm or expected)
+                                 expected=has_drm or ignored or expected)
         else:
             self.report_warning(msg)

@@ -945,7 +1043,7 @@ class YoutubeDL(object):
     def validate_outtmpl(cls, outtmpl):
         ''' @return None or Exception object '''
         outtmpl = re.sub(
-            STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqBU]'),
+            STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqBUDS]'),
             lambda mobj: f'{mobj.group(0)[:-1]}s',
             cls._outtmpl_expandpath(outtmpl))
         try:
@@ -957,12 +1055,15 @@
     @staticmethod
     def _copy_infodict(info_dict):
         info_dict = dict(info_dict)
-        for key in ('__original_infodict', '__postprocessors'):
-            info_dict.pop(key, None)
+        info_dict.pop('__postprocessors', None)
         return info_dict

-    def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None):
-        """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict """
+    def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False):
+        """ Make the outtmpl and info_dict suitable for substitution: ydl.escape_outtmpl(outtmpl) % info_dict
+        @param sanitize    Whether to sanitize the output as a filename.
+                           For backward compatibility, a function can also be passed
+        """
+
+        info_dict.setdefault('epoch', int(time.time()))  # keep epoch consistent once set

         info_dict = self._copy_infodict(info_dict)
@@ -971,19 +1072,20 @@
             if info_dict.get('duration', None) is not None
             else None)
         info_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
+        info_dict['video_autonumber'] = self._num_videos
         if info_dict.get('resolution') is None:
             info_dict['resolution'] = self.format_resolution(info_dict, default=None)

         # For fields playlist_index, playlist_autonumber and autonumber convert all occurrences
         # of %(field)s to %(field)0Nd for backward compatibility
         field_size_compat_map = {
-            'playlist_index': len(str(info_dict.get('_last_playlist_index') or '')),
-            'playlist_autonumber': len(str(info_dict.get('n_entries') or '')),
+            'playlist_index': number_of_digits(info_dict.get('_last_playlist_index') or 0),
+            'playlist_autonumber': number_of_digits(info_dict.get('n_entries') or 0),
             'autonumber': self.params.get('autonumber_size') or 5,
         }

         TMPL_DICT = {}
-        EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqBU]'))
+        EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqBUDS]'))
         MATH_FUNCTIONS = {
             '+': float.__add__,
             '-': float.__sub__,
@@ -991,16 +1093,18 @@
         # Field is of the form key1.key2...
         # where keys (except first) can be string, int or slice
         FIELD_RE = r'\w*(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)')
-        MATH_FIELD_RE = r'''{field}|{num}'''.format(field=FIELD_RE, num=r'-?\d+(?:.\d+)?')
+        MATH_FIELD_RE = r'''(?:{field}|{num})'''.format(field=FIELD_RE, num=r'-?\d+(?:.\d+)?')
         MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys()))
         INTERNAL_FORMAT_RE = re.compile(r'''(?x)
             (?P<negate>-)?
             (?P<fields>{field})
             (?P<maths>(?:{math_op}{math_field})*)
             (?:>(?P<strf_format>.+?))?
-            (?P<alternate>(?<!\\),[^|)]*)?
-            (?:\|(?P<default>.*?))?
-            $'''.format(field=FIELD_RE, math_op=MATH_OPERATORS_RE, math_field=MATH_FIELD_RE))
+            (?P<remaining>
+                (?P<alternate>(?<!\\),[^|&)]*)?
+                (?:&(?P<replacement>.*?))?
+                (?:\|(?P<default>.*?))?
+ )$'''.format(field=FIELD_RE, math_op=MATH_OPERATORS_RE, math_field=MATH_FIELD_RE)) def _traverse_infodict(k): k = k.split('.') @@ -1046,24 +1150,34 @@ class YoutubeDL(object): na = self.params.get('outtmpl_na_placeholder', 'NA') + def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')): + return sanitize_filename(str(value), restricted=restricted, is_id=( + bool(re.search(r'(^|[_.])id(\.|$)', key)) + if 'filename-sanitization' in self.params.get('compat_opts', []) + else NO_DEFAULT)) + + sanitizer = sanitize if callable(sanitize) else filename_sanitizer + sanitize = bool(sanitize) + def _dumpjson_default(obj): if isinstance(obj, (set, LazyList)): return list(obj) - raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable') + return repr(obj) def create_key(outer_mobj): if not outer_mobj.group('has_key'): return outer_mobj.group(0) key = outer_mobj.group('key') mobj = re.match(INTERNAL_FORMAT_RE, key) - initial_field = mobj.group('fields').split('.')[-1] if mobj else '' - value, default = None, na + initial_field = mobj.group('fields') if mobj else '' + value, replacement, default = None, None, na while mobj: mobj = mobj.groupdict() default = mobj['default'] if mobj['default'] is not None else default value = get_value(mobj) + replacement = mobj['replacement'] if value is None and mobj['alternate']: - mobj = re.match(INTERNAL_FORMAT_RE, mobj['alternate'][1:]) + mobj = re.match(INTERNAL_FORMAT_RE, mobj['remaining'][1:]) else: break @@ -1071,25 +1185,32 @@ class YoutubeDL(object): if fmt == 's' and value is not None and key in field_size_compat_map.keys(): fmt = '0{:d}d'.format(field_size_compat_map[key]) - value = default if value is None else value + value = default if value is None else value if replacement is None else replacement + flags = outer_mobj.group('conversion') or '' str_fmt = f'{fmt[:-1]}s' if fmt[-1] == 'l': # list - delim = '\n' if '#' in (outer_mobj.group('conversion') or '') else ', ' - value, fmt = delim.join(variadic(value)), str_fmt + delim = '\n' if '#' in flags else ', ' + value, fmt = delim.join(map(str, variadic(value, allowed_types=(str, bytes)))), str_fmt elif fmt[-1] == 'j': # json - value, fmt = json.dumps(value, default=_dumpjson_default), str_fmt + value, fmt = json.dumps(value, default=_dumpjson_default, indent=4 if '#' in flags else None), str_fmt elif fmt[-1] == 'q': # quoted - value, fmt = compat_shlex_quote(str(value)), str_fmt + value = map(str, variadic(value) if '#' in flags else [value]) + value, fmt = ' '.join(map(compat_shlex_quote, value)), str_fmt elif fmt[-1] == 'B': # bytes value = f'%{str_fmt}'.encode('utf-8') % str(value).encode('utf-8') value, fmt = value.decode('utf-8', 'ignore'), 's' elif fmt[-1] == 'U': # unicode normalized - opts = outer_mobj.group('conversion') or '' value, fmt = unicodedata.normalize( # "+" = compatibility equivalence, "#" = NFD - 'NF%s%s' % ('K' if '+' in opts else '', 'D' if '#' in opts else 'C'), + 'NF%s%s' % ('K' if '+' in flags else '', 'D' if '#' in flags else 'C'), value), str_fmt + elif fmt[-1] == 'D': # decimal suffix + num_fmt, fmt = fmt[:-1].replace('#', ''), 's' + value = format_decimal_suffix(value, f'%{num_fmt}f%s' if num_fmt else '%d%s', + factor=1024 if '#' in flags else 1000) + elif fmt[-1] == 'S': # filename sanitization + value, fmt = filename_sanitizer(initial_field, value, restricted='#' in flags), str_fmt elif fmt[-1] == 'c': if value: value = str(value)[0] @@ -1106,7 +1227,7 @@ class YoutubeDL(object): # So we convert it to repr first value, fmt = 
repr(value), str_fmt if fmt[-1] in 'csr': - value = sanitize(initial_field, value) + value = sanitizer(initial_field, value) key = '%s\0%s' % (key.replace('%', '%\0'), outer_mobj.group('format')) TMPL_DICT[key] = value @@ -1118,38 +1239,42 @@ class YoutubeDL(object): outtmpl, info_dict = self.prepare_outtmpl(outtmpl, info_dict, *args, **kwargs) return self.escape_outtmpl(outtmpl) % info_dict - def _prepare_filename(self, info_dict, tmpl_type='default'): + def _prepare_filename(self, info_dict, *, outtmpl=None, tmpl_type=None): + assert None in (outtmpl, tmpl_type), 'outtmpl and tmpl_type are mutually exclusive' + if outtmpl is None: + outtmpl = self.outtmpl_dict.get(tmpl_type or 'default', self.outtmpl_dict['default']) try: - sanitize = lambda k, v: sanitize_filename( - compat_str(v), - restricted=self.params.get('restrictfilenames'), - is_id=(k == 'id' or k.endswith('_id'))) - outtmpl = self._outtmpl_expandpath(self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default'])) - filename = self.evaluate_outtmpl(outtmpl, info_dict, sanitize) + outtmpl = self._outtmpl_expandpath(outtmpl) + filename = self.evaluate_outtmpl(outtmpl, info_dict, True) + if not filename: + return None - force_ext = OUTTMPL_TYPES.get(tmpl_type) - if filename and force_ext is not None: - filename = replace_extension(filename, force_ext, info_dict.get('ext')) + if tmpl_type in ('', 'temp'): + final_ext, ext = self.params.get('final_ext'), info_dict.get('ext') + if final_ext and ext and final_ext != ext and filename.endswith(f'.{final_ext}'): + filename = replace_extension(filename, ext, final_ext) + elif tmpl_type: + force_ext = OUTTMPL_TYPES[tmpl_type] + if force_ext: + filename = replace_extension(filename, force_ext, info_dict.get('ext')) # https://github.com/blackjack4494/youtube-dlc/issues/85 trim_file_name = self.params.get('trim_file_name', False) if trim_file_name: - fn_groups = filename.rsplit('.') - ext = fn_groups[-1] - sub_ext = '' - if len(fn_groups) > 2: - sub_ext = fn_groups[-2] - filename = '.'.join(filter(None, [fn_groups[0][:trim_file_name], sub_ext, ext])) + no_ext, *ext = filename.rsplit('.', 2) + filename = join_nonempty(no_ext[:trim_file_name], *ext, delim='.') return filename except ValueError as err: self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')') return None - def prepare_filename(self, info_dict, dir_type='', warn=False): - """Generate the output filename.""" - - filename = self._prepare_filename(info_dict, dir_type or 'default') + def prepare_filename(self, info_dict, dir_type='', *, outtmpl=None, warn=False): + """Generate the output filename""" + if outtmpl: + assert not dir_type, 'outtmpl and dir_type are mutually exclusive' + dir_type = None + filename = self._prepare_filename(info_dict, tmpl_type=dir_type, outtmpl=outtmpl) if not filename and dir_type not in ('', 'temp'): return '' @@ -1266,8 +1391,9 @@ class YoutubeDL(object): temp_id = ie.get_temp_id(url) if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}): - self.to_screen("[%s] %s: has already been recorded in archive" % ( - ie_key, temp_id)) + self.to_screen(f'[{ie_key}] {temp_id}: has already been recorded in the archive') + if self.params.get('break_on_existing', False): + raise ExistingVideoReached() break return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process) else: @@ -1276,30 +1402,76 @@ class YoutubeDL(object): def __handle_extraction_exceptions(func): @functools.wraps(func) def 
wrapper(self, *args, **kwargs): - try: - return func(self, *args, **kwargs) - except GeoRestrictedError as e: - msg = e.msg - if e.countries: - msg += '\nThis video is available in %s.' % ', '.join( - map(ISO3166Utils.short2full, e.countries)) - msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' - self.report_error(msg) - except ExtractorError as e: # An error we somewhat expected - self.report_error(compat_str(e), e.format_traceback()) - except ThrottledDownload: - self.to_stderr('\r') - self.report_warning('The download speed is below throttle limit. Re-extracting data') - return wrapper(self, *args, **kwargs) - except (MaxDownloadsReached, ExistingVideoReached, RejectedVideoReached, LazyList.IndexError): - raise - except Exception as e: - if self.params.get('ignoreerrors'): - self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc())) - else: + while True: + try: + return func(self, *args, **kwargs) + except (DownloadCancelled, LazyList.IndexError, PagedList.IndexError): raise + except ReExtractInfo as e: + if e.expected: + self.to_screen(f'{e}; Re-extracting data') + else: + self.to_stderr('\r') + self.report_warning(f'{e}; Re-extracting data') + continue + except GeoRestrictedError as e: + msg = e.msg + if e.countries: + msg += '\nThis video is available in %s.' % ', '.join( + map(ISO3166Utils.short2full, e.countries)) + msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' + self.report_error(msg) + except ExtractorError as e: # An error we somewhat expected + self.report_error(str(e), e.format_traceback()) + except Exception as e: + if self.params.get('ignoreerrors'): + self.report_error(str(e), tb=encode_compat_str(traceback.format_exc())) + else: + raise + break return wrapper + def _wait_for_video(self, ie_result): + if (not self.params.get('wait_for_video') + or ie_result.get('_type', 'video') != 'video' + or ie_result.get('formats') or ie_result.get('url')): + return + + format_dur = lambda dur: '%02d:%02d:%02d' % timetuple_from_msec(dur * 1000)[:-1] + last_msg = '' + + def progress(msg): + nonlocal last_msg + self.to_screen(msg + ' ' * (len(last_msg) - len(msg)) + '\r', skip_eol=True) + last_msg = msg + + min_wait, max_wait = self.params.get('wait_for_video') + diff = try_get(ie_result, lambda x: x['release_timestamp'] - time.time()) + if diff is None and ie_result.get('live_status') == 'is_upcoming': + diff = round(random.uniform(min_wait, max_wait) if (max_wait and min_wait) else (max_wait or min_wait), 0) + self.report_warning('Release time of video is not known') + elif (diff or 0) <= 0: + self.report_warning('Video should already be available according to extracted info') + diff = min(max(diff or 0, min_wait or 0), max_wait or float('inf')) + self.to_screen(f'[wait] Waiting for {format_dur(diff)} - Press Ctrl+C to try now') + + wait_till = time.time() + diff + try: + while True: + diff = wait_till - time.time() + if diff <= 0: + progress('') + raise ReExtractInfo('[wait] Wait period ended', expected=True) + progress(f'[wait] Remaining time until next attempt: {self._format_screen(format_dur(diff), self.Styles.EMPHASIS)}') + time.sleep(1) + except KeyboardInterrupt: + progress('') + raise ReExtractInfo('[wait] Interrupted by user', expected=True) + except BaseException as e: + if not isinstance(e, ReExtractInfo): + self.to_screen('') + raise + @__handle_extraction_exceptions def __extract_info(self, url, ie, download, extra_info, process): ie_result = ie.extract(url) @@ -1315,6 
+1487,7 @@ class YoutubeDL(object): ie_result.setdefault('original_url', extra_info['original_url']) self.add_default_extra_info(ie_result, ie, url) if process: + self._wait_for_video(ie_result) return self.process_ie_result(ie_result, download, extra_info) else: return ie_result @@ -1324,7 +1497,12 @@ class YoutubeDL(object): self.add_extra_info(ie_result, { 'webpage_url': url, 'original_url': url, - 'webpage_url_basename': url_basename(url), + }) + webpage_url = ie_result.get('webpage_url') + if webpage_url: + self.add_extra_info(ie_result, { + 'webpage_url_basename': url_basename(webpage_url), + 'webpage_url_domain': get_domain(webpage_url), }) if ie is not None: self.add_extra_info(ie_result, { @@ -1358,6 +1536,7 @@ class YoutubeDL(object): info_copy['id'] = ie.get_temp_id(ie_result['url']) self.add_default_extra_info(info_copy, ie, ie_result['url']) self.add_extra_info(info_copy, extra_info) + info_copy, _ = self.pre_process(info_copy) self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True) if self.params.get('force_write_download_archive', False): self.record_download_archive(info_copy) @@ -1376,7 +1555,7 @@ class YoutubeDL(object): self.write_debug('Additional URLs: "%s"' % '", "'.join(additional_urls)) ie_result['additional_entries'] = [ self.extract_info( - url, download, extra_info, + url, download, extra_info=extra_info, force_generic_extractor=self.params.get('force_generic_extractor')) for url in additional_urls ] @@ -1400,13 +1579,9 @@ class YoutubeDL(object): if not info: return info - force_properties = dict( - (k, v) for k, v in ie_result.items() if v is not None) - for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'): - if f in force_properties: - del force_properties[f] new_result = info.copy() - new_result.update(force_properties) + new_result.update(filter_dict(ie_result, lambda k, v: ( + v is not None and k not in {'_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'}))) # Extracted info may not be a video result (i.e. 
# info.get('_type', 'video') != video) but rather an url or @@ -1431,6 +1606,7 @@ class YoutubeDL(object): self._playlist_level += 1 self._playlist_urls.add(webpage_url) + self._fill_common_fields(ie_result, False) self._sanitize_thumbnails(ie_result) try: return self.__process_playlist(ie_result, download) @@ -1448,6 +1624,7 @@ class YoutubeDL(object): 'extractor': ie_result['extractor'], 'webpage_url': ie_result['webpage_url'], 'webpage_url_basename': url_basename(ie_result['webpage_url']), + 'webpage_url_domain': get_domain(ie_result['webpage_url']), 'extractor_key': ie_result['extractor_key'], }) return r @@ -1462,18 +1639,33 @@ class YoutubeDL(object): def _ensure_dir_exists(self, path): return make_dir(path, self.report_error) + @staticmethod + def _playlist_infodict(ie_result, **kwargs): + return { + **ie_result, + 'playlist': ie_result.get('title') or ie_result.get('id'), + 'playlist_id': ie_result.get('id'), + 'playlist_title': ie_result.get('title'), + 'playlist_uploader': ie_result.get('uploader'), + 'playlist_uploader_id': ie_result.get('uploader_id'), + 'playlist_index': 0, + **kwargs, + } + def __process_playlist(self, ie_result, download): # We process each entry in the playlist playlist = ie_result.get('title') or ie_result.get('id') self.to_screen('[download] Downloading playlist: %s' % playlist) if 'entries' not in ie_result: - raise EntryNotInPlaylist() + raise EntryNotInPlaylist('There are no entries') + + MissingEntry = object() incomplete_entries = bool(ie_result.get('requested_entries')) if incomplete_entries: - def fill_missing_entries(entries, indexes): - ret = [None] * max(*indexes) - for i, entry in zip(indexes, entries): + def fill_missing_entries(entries, indices): + ret = [MissingEntry] * max(indices) + for i, entry in zip(indices, entries): ret[i - 1] = entry return ret ie_result['entries'] = fill_missing_entries(ie_result['entries'], ie_result['requested_entries']) @@ -1500,23 +1692,27 @@ class YoutubeDL(object): playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) ie_entries = ie_result['entries'] - msg = ( - 'Downloading %d videos' if not isinstance(ie_entries, list) - else 'Collected %d videos; downloading %%d of them' % len(ie_entries)) - if isinstance(ie_entries, list): + playlist_count = len(ie_entries) + msg = f'Collected {playlist_count} videos; downloading %d of them' + ie_result['playlist_count'] = ie_result.get('playlist_count') or playlist_count + def get_entry(i): return ie_entries[i - 1] else: - if not isinstance(ie_entries, PagedList): + msg = 'Downloading %d videos' + if not isinstance(ie_entries, (PagedList, LazyList)): ie_entries = LazyList(ie_entries) + elif isinstance(ie_entries, InAdvancePagedList): + if ie_entries._pagesize == 1: + playlist_count = ie_entries._pagecount def get_entry(i): return YoutubeDL.__handle_extraction_exceptions( lambda self, i: ie_entries[i - 1] )(self, i) - entries = [] + entries, broken = [], False items = playlistitems if playlistitems is not None else itertools.count(playliststart) for i in items: if i == 0: @@ -1526,11 +1722,11 @@ class YoutubeDL(object): entry = None try: entry = get_entry(i) - if entry is None: + if entry is MissingEntry: raise EntryNotInPlaylist() except (IndexError, EntryNotInPlaylist): if incomplete_entries: - raise EntryNotInPlaylist() + raise EntryNotInPlaylist(f'Entry {i} cannot be found') elif not playlistitems: break entries.append(entry) @@ -1538,6 +1734,7 @@ class YoutubeDL(object): if entry is not None: self._match_entry(entry, incomplete=True, silent=True) except 
(ExistingVideoReached, RejectedVideoReached):
+                broken = True
                 break
         ie_result['entries'] = entries

@@ -1548,23 +1745,22 @@
             if entry is not None]
         n_entries = len(entries)

-        if not playlistitems and (playliststart or playlistend):
+        if not (ie_result.get('playlist_count') or broken or playlistitems or playlistend):
+            ie_result['playlist_count'] = n_entries
+
+        if not playlistitems and (playliststart != 1 or playlistend):
             playlistitems = list(range(playliststart, playliststart + n_entries))
         ie_result['requested_entries'] = playlistitems

-        if self.params.get('allow_playlist_files', True):
-            ie_copy = {
-                'playlist': playlist,
-                'playlist_id': ie_result.get('id'),
-                'playlist_title': ie_result.get('title'),
-                'playlist_uploader': ie_result.get('uploader'),
-                'playlist_uploader_id': ie_result.get('uploader_id'),
-                'playlist_index': 0,
-            }
-            ie_copy.update(dict(ie_result))
-
-            if self._write_info_json('playlist', ie_result,
-                                     self.prepare_filename(ie_copy, 'pl_infojson')) is None:
+        _infojson_written = False
+        write_playlist_files = self.params.get('allow_playlist_files', True)
+        if write_playlist_files and self.params.get('list_thumbnails'):
+            self.list_thumbnails(ie_result)
+        if write_playlist_files and not self.params.get('simulate'):
+            ie_copy = self._playlist_infodict(ie_result, n_entries=n_entries)
+            _infojson_written = self._write_info_json(
+                'playlist', ie_result, self.prepare_filename(ie_copy, 'pl_infojson'))
+            if _infojson_written is None:
                 return
             if self._write_description('playlist', ie_result,
                                        self.prepare_filename(ie_copy, 'pl_description')) is None:
@@ -1594,6 +1790,7 @@
             extra = {
                 'n_entries': n_entries,
                 '_last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries),
+                'playlist_count': ie_result.get('playlist_count'),
                 'playlist_index': playlist_index,
                 'playlist_autonumber': i,
                 'playlist': playlist,
@@ -1604,6 +1801,7 @@
                 'extractor': ie_result['extractor'],
                 'webpage_url': ie_result['webpage_url'],
                 'webpage_url_basename': url_basename(ie_result['webpage_url']),
+                'webpage_url_domain': get_domain(ie_result['webpage_url']),
                 'extractor_key': ie_result['extractor_key'],
             }

@@ -1617,10 +1815,17 @@
                 self.report_error(
                     'Skipping the remaining entries in playlist "%s" since %d items failed extraction' % (playlist, failures))
                 break
-            # TODO: skip failed (empty) entries?
             playlist_results.append(entry_result)
         ie_result['entries'] = playlist_results
-        self.to_screen('[download] Finished downloading playlist: %s' % playlist)
+
+        # Write the updated info to json
+        if _infojson_written is True and self._write_info_json(
+                'updated playlist', ie_result,
+                self.prepare_filename(ie_copy, 'pl_infojson'), overwrite=True) is None:
+            return
+
+        ie_result = self.run_all_pps('playlist', ie_result)
+        self.to_screen(f'[download] Finished downloading playlist: {playlist}')
         return ie_result

     @__handle_extraction_exceptions
@@ -1664,15 +1869,21 @@
             '^=': lambda attr, value: attr.startswith(value),
             '$=': lambda attr, value: attr.endswith(value),
             '*=': lambda attr, value: value in attr,
+            '~=': lambda attr, value: value.search(attr) is not None
         }
         str_operator_rex = re.compile(r'''(?x)\s*
             (?P<key>[a-zA-Z0-9._-]+)\s*
-            (?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
-            (?P<value>[a-zA-Z0-9._-]+)\s*
+            (?P<negation>!\s*)?(?P<op>%s)\s*(?P<none_inclusive>\?\s*)?
+            (?P<quote>["'])?
+            (?P<value>(?(quote)(?:(?!(?P=quote))[^\\]|\\.)+|[\w.-]+))
+            (?(quote)(?P=quote))\s*
             ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
         m = str_operator_rex.fullmatch(filter_spec)
         if m:
-            comparison_value = m.group('value')
+            if m.group('op') == '~=':
+                comparison_value = re.compile(m.group('value'))
+            else:
+                comparison_value = re.sub(r'''\\([\\"'])''', r'\1', m.group('value'))
             str_op = STR_OPERATORS[m.group('op')]
             if m.group('negation'):
                 op = lambda attr, value: not str_op(attr, value)
@@ -1689,6 +1900,29 @@
             return op(actual_value, comparison_value)
         return _filter

+    def _check_formats(self, formats):
+        for f in formats:
+            self.to_screen('[info] Testing format %s' % f['format_id'])
+            path = self.get_output_path('temp')
+            if not self._ensure_dir_exists(f'{path}/'):
+                continue
+            temp_file = tempfile.NamedTemporaryFile(suffix='.tmp', delete=False, dir=path or None)
+            temp_file.close()
+            try:
+                success, _ = self.dl(temp_file.name, f, test=True)
+            except (DownloadError, IOError, OSError, ValueError) + network_exceptions:
+                success = False
+            finally:
+                if os.path.exists(temp_file.name):
+                    try:
+                        os.remove(temp_file.name)
+                    except OSError:
+                        self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
+            if success:
+                yield f
+            else:
+                self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id'])
+
     def _default_format_spec(self, info_dict, download=True):

         def can_merge():
@@ -1728,7 +1962,7 @@
         allow_multiple_streams = {'audio': self.params.get('allow_multiple_audio_streams', False),
                                   'video': self.params.get('allow_multiple_video_streams', False)}

-        check_formats = self.params.get('check_formats')
+        check_formats = self.params.get('check_formats') == 'selected'

         def _parse_filter(tokens):
             filter_parts = []
@@ -1873,9 +2107,9 @@
                 'format_id': '+'.join(filtered('format_id')),
                 'ext': output_ext,
                 'protocol': '+'.join(map(determine_protocol, formats_info)),
-                'language': '+'.join(orderedSet(filtered('language'))),
-                'format_note': '+'.join(orderedSet(filtered('format_note'))),
-                'filesize_approx': sum(filtered('filesize', 'filesize_approx')),
+                'language': '+'.join(orderedSet(filtered('language'))) or None,
+                'format_note': '+'.join(orderedSet(filtered('format_note'))) or None,
+                'filesize_approx': sum(filtered('filesize', 'filesize_approx')) or None,
                 'tbr': sum(filtered('tbr', 'vbr', 'abr')),
             }

@@ -1885,6 +2119,7 @@
                     'height': the_only_video.get('height'),
                     'resolution': the_only_video.get('resolution') or self.format_resolution(the_only_video),
                     'fps': the_only_video.get('fps'),
+                    'dynamic_range': the_only_video.get('dynamic_range'),
                     'vcodec': the_only_video.get('vcodec'),
                     'vbr': the_only_video.get('vbr'),
                     'stretched_ratio': the_only_video.get('stretched_ratio'),
@@ -1903,26 +2138,7 @@
             if not check_formats:
                 yield from formats
                 return
-            for f in formats:
-                self.to_screen('[info] Testing format %s' % f['format_id'])
-                temp_file = tempfile.NamedTemporaryFile(
-                    suffix='.tmp', delete=False,
-                    dir=self.get_output_path('temp') or None)
-                temp_file.close()
-                try:
-                    success, _ = self.dl(temp_file.name, f, test=True)
-                except (DownloadError, IOError, OSError, ValueError) + network_exceptions:
-                    success = False
-                finally:
-                    if os.path.exists(temp_file.name):
-                        try:
-                            os.remove(temp_file.name)
-                        except OSError:
-                            self.report_warning('Unable to delete temporary file "%s"' % temp_file.name)
-                if success:
-                    yield f
-                else:
-                    self.to_screen('[info] Unable to download format %s. Skipping...' % f['format_id'])
+            yield from self._check_formats(formats)

         def _build_selector_function(selector):
             if isinstance(selector, list):  # ,
@@ -1950,8 +2166,7 @@
                 selector_1, selector_2 = map(_build_selector_function, selector.selector)

                 def selector_function(ctx):
-                    for pair in itertools.product(
-                            selector_1(copy.deepcopy(ctx)), selector_2(copy.deepcopy(ctx))):
+                    for pair in itertools.product(selector_1(ctx), selector_2(ctx)):
                         yield _merge(pair)

             elif selector.type == SINGLE:  # atom
                 format_spec = selector.selector or 'best'

                 # TODO: Add allvideo, allaudio etc by generalizing the code with best/worst selector
                 if format_spec == 'all':
                     def selector_function(ctx):
-                        yield from _check_formats(ctx['formats'])
+                        yield from _check_formats(ctx['formats'][::-1])
                 elif format_spec == 'mergeall':
                     def selector_function(ctx):
                         formats = list(_check_formats(ctx['formats']))
@@ -1972,7 +2187,7 @@
                         yield merged_format

                 else:
-                    format_fallback, format_reverse, format_idx = False, True, 1
+                    format_fallback, seperate_fallback, format_reverse, format_idx = False, None, True, 1
                     mobj = re.match(
                         r'(?P<bw>best|worst|b|w)(?P<type>video|audio|v|a)?(?P<mod>\*)?(?:\.(?P<n>[1-9]\d*))?$',
                         format_spec)
@@ -1999,6 +2214,7 @@
                         filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none'
                     elif format_spec in self._format_selection_exts['video']:
                         filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') != 'none' and f.get('vcodec') != 'none'
+                        seperate_fallback = lambda f: f.get('ext') == format_spec and f.get('vcodec') != 'none'
                     elif format_spec in self._format_selection_exts['storyboards']:
                         filter_f = lambda f: f.get('ext') == format_spec and f.get('acodec') == 'none' and f.get('vcodec') == 'none'
                     else:
@@ -2007,11 +2223,15 @@
                     def selector_function(ctx):
                         formats = list(ctx['formats'])
                         matches = list(filter(filter_f, formats)) if filter_f is not None else formats
-                        if format_fallback and ctx['incomplete_formats'] and not matches:
-                            # for extractors with incomplete formats (audio only (soundcloud)
-                            # or video only (imgur)) best/worst will fallback to
-                            # best/worst {video,audio}-only format
-                            matches = formats
+                        if not matches:
+                            if format_fallback and ctx['incomplete_formats']:
+                                # for extractors with incomplete formats (audio only (soundcloud)
+                                # or video only (imgur)) best/worst will fallback to
+                                # best/worst {video,audio}-only format
+                                matches = formats
+                            elif seperate_fallback and not ctx['has_merged_format']:
+                                # for compatibility with youtube-dl when there is no pre-merged format
+                                matches = list(filter(seperate_fallback, formats))
                         matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1]))
                         try:
                             yield matches[format_idx - 1]
@@ -2021,7 +2241,7 @@
             filters = [self._build_format_filter(f) for f in selector.filters]

             def final_selector(ctx):
-                ctx_copy = copy.deepcopy(ctx)
+                ctx_copy = dict(ctx)
                 for _filter in filters:
                     ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
                 return selector_function(ctx_copy)
@@ -2057,11 +2277,7 @@
         return _build_selector_function(parsed_selector)

     def _calc_headers(self, info_dict):
-        res = std_headers.copy()
-
-        add_headers = info_dict.get('http_headers')
-        if add_headers:
-            res.update(add_headers)
+        res = merge_headers(self.params['http_headers'], info_dict.get('http_headers') or {})

         cookies = self._calc_cookies(info_dict)
         if cookies:
@@ -2079,51 +2295,106 @@ class 
YoutubeDL(object): self.cookiejar.add_cookie_header(pr) return pr.get_header('Cookie') + def _sort_thumbnails(self, thumbnails): + thumbnails.sort(key=lambda t: ( + t.get('preference') if t.get('preference') is not None else -1, + t.get('width') if t.get('width') is not None else -1, + t.get('height') if t.get('height') is not None else -1, + t.get('id') if t.get('id') is not None else '', + t.get('url'))) + def _sanitize_thumbnails(self, info_dict): thumbnails = info_dict.get('thumbnails') if thumbnails is None: thumbnail = info_dict.get('thumbnail') if thumbnail: info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}] - if thumbnails: - thumbnails.sort(key=lambda t: ( - t.get('preference') if t.get('preference') is not None else -1, - t.get('width') if t.get('width') is not None else -1, - t.get('height') if t.get('height') is not None else -1, - t.get('id') if t.get('id') is not None else '', - t.get('url'))) - - def thumbnail_tester(): - def test_thumbnail(t): - self.to_screen(f'[info] Testing thumbnail {t["id"]}') - try: - self.urlopen(HEADRequest(t['url'])) - except network_exceptions as err: - self.to_screen(f'[info] Unable to connect to thumbnail {t["id"]} URL {t["url"]!r} - {err}. Skipping...') - return False - return True - return test_thumbnail - - for i, t in enumerate(thumbnails): - if t.get('id') is None: - t['id'] = '%d' % i - if t.get('width') and t.get('height'): - t['resolution'] = '%dx%d' % (t['width'], t['height']) - t['url'] = sanitize_url(t['url']) - - if self.params.get('check_formats'): - info_dict['thumbnails'] = LazyList(filter(thumbnail_tester(), thumbnails[::-1])).reverse() - else: - info_dict['thumbnails'] = thumbnails + if not thumbnails: + return + + def check_thumbnails(thumbnails): + for t in thumbnails: + self.to_screen(f'[info] Testing thumbnail {t["id"]}') + try: + self.urlopen(HEADRequest(t['url'])) + except network_exceptions as err: + self.to_screen(f'[info] Unable to connect to thumbnail {t["id"]} URL {t["url"]!r} - {err}. Skipping...') + continue + yield t + + self._sort_thumbnails(thumbnails) + for i, t in enumerate(thumbnails): + if t.get('id') is None: + t['id'] = '%d' % i + if t.get('width') and t.get('height'): + t['resolution'] = '%dx%d' % (t['width'], t['height']) + t['url'] = sanitize_url(t['url']) + + if self.params.get('check_formats') is True: + info_dict['thumbnails'] = LazyList(check_thumbnails(thumbnails[::-1]), reverse=True) + else: + info_dict['thumbnails'] = thumbnails + + def _fill_common_fields(self, info_dict, is_video=True): + # TODO: move sanitization here + if is_video: + # playlists are allowed to lack "title" + info_dict['fulltitle'] = info_dict.get('title') + if 'title' not in info_dict: + raise ExtractorError('Missing "title" field in extractor result', + video_id=info_dict['id'], ie=info_dict['extractor']) + elif not info_dict.get('title'): + self.report_warning('Extractor failed to obtain "title". Creating a generic title instead') + info_dict['title'] = f'{info_dict["extractor"]} video #{info_dict["id"]}' + + if info_dict.get('duration') is not None: + info_dict['duration_string'] = formatSeconds(info_dict['duration']) + + for ts_key, date_key in ( + ('timestamp', 'upload_date'), + ('release_timestamp', 'release_date'), + ('modified_timestamp', 'modified_date'), + ): + if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None: + # Working around out-of-range timestamp values (e.g. 
negative ones on Windows, + # see http://bugs.python.org/issue1646728) + try: + upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key]) + info_dict[date_key] = upload_date.strftime('%Y%m%d') + except (ValueError, OverflowError, OSError): + pass + + live_keys = ('is_live', 'was_live') + live_status = info_dict.get('live_status') + if live_status is None: + for key in live_keys: + if info_dict.get(key) is False: + continue + if info_dict.get(key): + live_status = key + break + if all(info_dict.get(key) is False for key in live_keys): + live_status = 'not_live' + if live_status: + info_dict['live_status'] = live_status + for key in live_keys: + if info_dict.get(key) is None: + info_dict[key] = (live_status == key) + + # Auto generate title fields corresponding to the *_number fields when missing + # in order to always have clean titles. This is very common for TV series. + for field in ('chapter', 'season', 'episode'): + if info_dict.get('%s_number' % field) is not None and not info_dict.get(field): + info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field]) def process_video_result(self, info_dict, download=True): assert info_dict.get('_type', 'video') == 'video' + self._num_videos += 1 if 'id' not in info_dict: - raise ExtractorError('Missing "id" field in extractor result') - if 'title' not in info_dict: - raise ExtractorError('Missing "title" field in extractor result', - video_id=info_dict['id'], ie=info_dict['extractor']) + raise ExtractorError('Missing "id" field in extractor result', ie=info_dict['extractor']) + elif not info_dict.get('id'): + raise ExtractorError('Extractor failed to obtain "id"', ie=info_dict['extractor']) def report_force_conversion(field, field_not, conversion): self.report_warning( @@ -2147,6 +2418,8 @@ class YoutubeDL(object): sanitize_string_field(info_dict, 'id') sanitize_numeric_fields(info_dict) + if (info_dict.get('duration') or 0) <= 0 and info_dict.pop('duration', None): + self.report_warning('"duration" field is negative, there is an error in extractor') if 'playlist' not in info_dict: # It isn't part of a playlist @@ -2165,44 +2438,7 @@ class YoutubeDL(object): if info_dict.get('display_id') is None and 'id' in info_dict: info_dict['display_id'] = info_dict['id'] - if info_dict.get('duration') is not None: - info_dict['duration_string'] = formatSeconds(info_dict['duration']) - - for ts_key, date_key in ( - ('timestamp', 'upload_date'), - ('release_timestamp', 'release_date'), - ): - if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None: - # Working around out-of-range timestamp values (e.g. negative ones on Windows, - # see http://bugs.python.org/issue1646728) - try: - upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key]) - info_dict[date_key] = upload_date.strftime('%Y%m%d') - except (ValueError, OverflowError, OSError): - pass - - live_keys = ('is_live', 'was_live') - live_status = info_dict.get('live_status') - if live_status is None: - for key in live_keys: - if info_dict.get(key) is False: - continue - if info_dict.get(key): - live_status = key - break - if all(info_dict.get(key) is False for key in live_keys): - live_status = 'not_live' - if live_status: - info_dict['live_status'] = live_status - for key in live_keys: - if info_dict.get(key) is None: - info_dict[key] = (live_status == key) - - # Auto generate title fields corresponding to the *_number fields when missing - # in order to always have clean titles. This is very common for TV series. 
- for field in ('chapter', 'season', 'episode'): - if info_dict.get('%s_number' % field) is not None and not info_dict.get(field): - info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field]) + self._fill_common_fields(info_dict) for cc_kind in ('subtitles', 'automatic_captions'): cc = info_dict.get(cc_kind) @@ -2220,7 +2456,6 @@ class YoutubeDL(object): info_dict['requested_subtitles'] = self.process_subtitles( info_dict['id'], subtitles, automatic_captions) - # We now pick which formats have to be downloaded if info_dict.get('formats') is None: # There's only one format available formats = [info_dict] @@ -2230,6 +2465,21 @@ class YoutubeDL(object): info_dict['__has_drm'] = any(f.get('has_drm') for f in formats) if not self.params.get('allow_unplayable_formats'): formats = [f for f in formats if not f.get('has_drm')] + if info_dict['__has_drm'] and all( + f.get('acodec') == f.get('vcodec') == 'none' for f in formats): + self.report_warning( + 'This video is DRM protected and only images are available for download. ' + 'Use --list-formats to see them') + + get_from_start = not info_dict.get('is_live') or bool(self.params.get('live_from_start')) + if not get_from_start: + info_dict['title'] += ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M') + if info_dict.get('is_live') and formats: + formats = [f for f in formats if bool(f.get('is_from_start')) == get_from_start] + if get_from_start and not formats: + self.raise_no_formats(info_dict, msg=( + '--live-from-start is passed, but there are no formats that can be downloaded from the start. ' + 'If you want to download from the current time, use --no-live-from-start')) if not formats: self.raise_no_formats(info_dict) @@ -2292,6 +2542,10 @@ class YoutubeDL(object): format['resolution'] = self.format_resolution(format, default=None) if format.get('dynamic_range') is None and format.get('vcodec') != 'none': format['dynamic_range'] = 'SDR' + if (info_dict.get('duration') and format.get('tbr') + and not format.get('filesize') and not format.get('filesize_approx')): + format['filesize_approx'] = info_dict['duration'] * format['tbr'] * (1024 / 8) + # Add HTTP headers, so that external programs can use them from the # json output full_format_info = info_dict.copy() @@ -2301,7 +2555,8 @@ class YoutubeDL(object): if '__x_forwarded_for_ip' in info_dict: del info_dict['__x_forwarded_for_ip'] - # TODO Central sorting goes here + if self.params.get('check_formats') is True: + formats = LazyList(self._check_formats(formats[::-1]), reverse=True) if not formats or formats[0] is not info_dict: # only set the 'formats' fields if the original info_dict list them @@ -2312,20 +2567,27 @@ class YoutubeDL(object): info_dict, _ = self.pre_process(info_dict) + if self._match_entry(info_dict, incomplete=self._format_fields) is not None: + return info_dict + + self.post_extract(info_dict) + info_dict, _ = self.pre_process(info_dict, 'after_filter') + + # The pre-processors may have modified the formats + formats = info_dict.get('formats', [info_dict]) + + list_only = self.params.get('simulate') is None and ( + self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles')) + interactive_format_selection = not list_only and self.format_selector == '-' if self.params.get('list_thumbnails'): self.list_thumbnails(info_dict) - if self.params.get('listformats'): - if not info_dict.get('formats') and not info_dict.get('url'): - self.to_screen('%s has no formats' % info_dict['id']) - else: - 
self.list_formats(info_dict) if self.params.get('listsubtitles'): if 'automatic_captions' in info_dict: self.list_subtitles( info_dict['id'], automatic_captions, 'automatic captions') self.list_subtitles(info_dict['id'], subtitles, 'subtitles') - list_only = self.params.get('simulate') is None and ( - self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles')) + if self.params.get('listformats') or interactive_format_selection: + self.list_formats(info_dict) if list_only: # Without this printing, -F --print-json will not work self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True) @@ -2337,55 +2599,72 @@ class YoutubeDL(object): self.write_debug('Default format spec: %s' % req_format) format_selector = self.build_format_selector(req_format) - # While in format selection we may need to have an access to the original - # format set in order to calculate some metrics or do some processing. - # For now we need to be able to guess whether original formats provided - # by extractor are incomplete or not (i.e. whether extractor provides only - # video-only or audio-only formats) for proper formats selection for - # extractors with such incomplete formats (see - # https://github.com/ytdl-org/youtube-dl/pull/5556). - # Since formats may be filtered during format selection and may not match - # the original formats the results may be incorrect. Thus original formats - # or pre-calculated metrics should be passed to format selection routines - # as well. - # We will pass a context object containing all necessary additional data - # instead of just formats. - # This fixes incorrect format selection issue (see - # https://github.com/ytdl-org/youtube-dl/issues/10083). - incomplete_formats = ( - # All formats are video-only or - all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) - # all formats are audio-only - or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)) - - ctx = { - 'formats': formats, - 'incomplete_formats': incomplete_formats, - } + while True: + if interactive_format_selection: + req_format = input( + self._format_screen('\nEnter format selector: ', self.Styles.EMPHASIS)) + try: + format_selector = self.build_format_selector(req_format) + except SyntaxError as err: + self.report_error(err, tb=False, is_error=False) + continue + + formats_to_download = list(format_selector({ + 'formats': formats, + 'has_merged_format': any('none' not in (f.get('acodec'), f.get('vcodec')) for f in formats), + 'incomplete_formats': ( + # All formats are video-only or + all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) + # all formats are audio-only + or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)), + })) + if interactive_format_selection and not formats_to_download: + self.report_error('Requested format is not available', tb=False, is_error=False) + continue + break - formats_to_download = list(format_selector(ctx)) if not formats_to_download: if not self.params.get('ignore_no_formats_error'): - raise ExtractorError('Requested format is not available', expected=True, - video_id=info_dict['id'], ie=info_dict['extractor']) - else: - self.report_warning('Requested format is not available') - # Process what we can, even without any available formats. 
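# A hypothetical, standalone sketch (not part of the upstream patch) of the
# "incomplete formats" predicate built in the hunk above: the format pool is
# treated as incomplete when every format is video-only or every format is
# audio-only, which format selection needs to know
# (see https://github.com/ytdl-org/youtube-dl/pull/5556). The helper name
# is_incomplete is illustrative.
def is_incomplete(formats):
    all_video_only = all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
    all_audio_only = all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)
    return all_video_only or all_audio_only

assert is_incomplete([{'vcodec': 'avc1', 'acodec': 'none'}])
assert not is_incomplete([{'vcodec': 'avc1', 'acodec': 'mp4a'}])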
- self.process_info(dict(info_dict)) - elif download: - self.to_screen( - '[info] %s: Downloading %d format(s): %s' % ( - info_dict['id'], len(formats_to_download), - ", ".join([f['format_id'] for f in formats_to_download]))) - for fmt in formats_to_download: - new_info = dict(info_dict) - # Save a reference to the original info_dict so that it can be modified in process_info if needed - new_info['__original_infodict'] = info_dict + raise ExtractorError( + 'Requested format is not available. Use --list-formats for a list of available formats', + expected=True, video_id=info_dict['id'], ie=info_dict['extractor']) + self.report_warning('Requested format is not available') + # Process what we can, even without any available formats. + formats_to_download = [{}] + + best_format = formats_to_download[-1] + if download: + if best_format: + self.to_screen( + f'[info] {info_dict["id"]}: Downloading {len(formats_to_download)} format(s): ' + + ', '.join([f['format_id'] for f in formats_to_download])) + max_downloads_reached = False + for i, fmt in enumerate(formats_to_download): + formats_to_download[i] = new_info = self._copy_infodict(info_dict) new_info.update(fmt) - self.process_info(new_info) - # We update the info dict with the best quality format (backwards compatibility) - if formats_to_download: - info_dict.update(formats_to_download[-1]) + try: + self.process_info(new_info) + except MaxDownloadsReached: + max_downloads_reached = True + # Remove copied info + for key, val in tuple(new_info.items()): + if info_dict.get(key) == val: + new_info.pop(key) + if max_downloads_reached: + break + + write_archive = set(f.get('__write_download_archive', False) for f in formats_to_download) + assert write_archive.issubset({True, False, 'ignore'}) + if True in write_archive and False not in write_archive: + self.record_download_archive(info_dict) + + info_dict['requested_downloads'] = formats_to_download + info_dict = self.run_all_pps('after_video', info_dict) + if max_downloads_reached: + raise MaxDownloadsReached() + + # We update the info dict with the selected best quality format (backwards compatibility) + info_dict.update(best_format) return info_dict def process_subtitles(self, video_id, normal_subtitles, automatic_captions): @@ -2411,12 +2690,15 @@ class YoutubeDL(object): # given in subtitleslangs. 
See https://github.com/hypervideo/hypervideo/issues/1041
             requested_langs = []
             for lang_re in self.params.get('subtitleslangs'):
-                if lang_re == 'all':
-                    requested_langs.extend(all_sub_langs)
-                    continue
                 discard = lang_re[0] == '-'
                 if discard:
                     lang_re = lang_re[1:]
+                if lang_re == 'all':
+                    if discard:
+                        requested_langs = []
+                    else:
+                        requested_langs.extend(all_sub_langs)
+                    continue
                 current_langs = filter(re.compile(lang_re + '$').match, all_sub_langs)
                 if discard:
                     for lang in current_langs:
@@ -2456,6 +2738,34 @@ class YoutubeDL(object):
                 subs[lang] = f
         return subs
 
+    def _forceprint(self, key, info_dict):
+        if info_dict is None:
+            return
+        info_copy = info_dict.copy()
+        info_copy['formats_table'] = self.render_formats_table(info_dict)
+        info_copy['thumbnails_table'] = self.render_thumbnails_table(info_dict)
+        info_copy['subtitles_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('subtitles'))
+        info_copy['automatic_captions_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('automatic_captions'))
+
+        def format_tmpl(tmpl):
+            mobj = re.match(r'\w+(=?)$', tmpl)
+            if mobj and mobj.group(1):
+                return f'{tmpl[:-1]} = %({tmpl[:-1]})r'
+            elif mobj:
+                return f'%({tmpl})s'
+            return tmpl
+
+        for tmpl in self.params['forceprint'].get(key, []):
+            self.to_stdout(self.evaluate_outtmpl(format_tmpl(tmpl), info_copy))
+
+        for tmpl, file_tmpl in self.params['print_to_file'].get(key, []):
+            filename = self.prepare_filename(info_dict, outtmpl=file_tmpl)
+            tmpl = format_tmpl(tmpl)
+            self.to_screen(f'[info] Writing {tmpl!r} to: {filename}')
+            if self._ensure_dir_exists(filename):
+                with io.open(filename, 'a', encoding='utf-8') as f:
+                    f.write(self.evaluate_outtmpl(tmpl, info_copy) + '\n')
+
     def __forced_printings(self, info_dict, filename, incomplete):
         def print_mandatory(field, actual_field=None):
             if actual_field is None:
@@ -2475,18 +2785,14 @@ class YoutubeDL(object):
         if info_dict.get('requested_formats') is not None:
             # For RTMP URLs, also include the playpath
             info_dict['urls'] = '\n'.join(f['url'] + f.get('play_path', '') for f in info_dict['requested_formats'])
-        elif 'url' in info_dict:
+        elif info_dict.get('url'):
             info_dict['urls'] = info_dict['url'] + info_dict.get('play_path', '')
 
-        if self.params.get('forceprint') or self.params.get('forcejson'):
+        if (self.params.get('forcejson')
+                or self.params['forceprint'].get('video')
+                or self.params['print_to_file'].get('video')):
             self.post_extract(info_dict)
-        for tmpl in self.params.get('forceprint', []):
-            mobj = re.match(r'\w+(=?)$', tmpl)
-            if mobj and mobj.group(1):
-                tmpl = f'{tmpl[:-1]} = %({tmpl[:-1]})s'
-            elif mobj:
-                tmpl = '%({})s'.format(tmpl)
-            self.to_stdout(self.evaluate_outtmpl(tmpl, info_dict))
+        self._forceprint('video', info_dict)
 
         print_mandatory('title')
         print_mandatory('id')
@@ -2524,33 +2830,44 @@ class YoutubeDL(object):
         if not test:
             for ph in self._progress_hooks:
                 fd.add_progress_hook(ph)
-            urls = '", "'.join([f['url'] for f in info.get('requested_formats', [])] or [info['url']])
+            urls = '", "'.join(
+                (f['url'].split(',')[0] + ',' if f['url'].startswith('data:') else f['url'])
+                for f in info.get('requested_formats', []) or [info])
             self.write_debug('Invoking downloader on "%s"' % urls)
 
-        new_info = copy.deepcopy(self._copy_infodict(info))
+        # Note: Ideally, info should be deep-copied so that hooks cannot modify it.
+ # But it may contain objects that are not deep-copyable + new_info = self._copy_infodict(info) if new_info.get('http_headers') is None: new_info['http_headers'] = self._calc_headers(new_info) return fd.download(name, new_info, subtitle) - def process_info(self, info_dict): - """Process a single resolved IE result.""" + def existing_file(self, filepaths, *, default_overwrite=True): + existing_files = list(filter(os.path.exists, orderedSet(filepaths))) + if existing_files and not self.params.get('overwrites', default_overwrite): + return existing_files[0] - assert info_dict.get('_type', 'video') == 'video' + for file in existing_files: + self.report_file_delete(file) + os.remove(file) + return None - max_downloads = self.params.get('max_downloads') - if max_downloads is not None: - if self._num_downloads >= int(max_downloads): - raise MaxDownloadsReached() + def process_info(self, info_dict): + """Process a single resolved IE result. (Modifies it in-place)""" - # TODO: backward compatibility, to be removed - info_dict['fulltitle'] = info_dict['title'] + assert info_dict.get('_type', 'video') == 'video' + original_infodict = info_dict if 'format' not in info_dict and 'ext' in info_dict: info_dict['format'] = info_dict['ext'] + # This is mostly just for backward compatibility of process_info + # As a side-effect, this allows for format-specific filters if self._match_entry(info_dict) is not None: + info_dict['__write_download_archive'] = 'ignore' return + # Does nothing under normal operation - for backward compatibility of process_info self.post_extract(info_dict) self._num_downloads += 1 @@ -2563,9 +2880,7 @@ class YoutubeDL(object): self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict)) if self.params.get('simulate'): - if self.params.get('force_write_download_archive', False): - self.record_download_archive(info_dict) - # Do nothing else if in simulate mode + info_dict['__write_download_archive'] = self.params.get('force_write_download_archive') return if full_filename is None: @@ -2593,6 +2908,8 @@ class YoutubeDL(object): infofn = self.prepare_filename(info_dict, 'infojson') _infojson_written = self._write_info_json('video', info_dict, infofn) if _infojson_written: + info_dict['infojson_filename'] = infofn + # For backward compatibility, even though it was a private field info_dict['__infojson_filename'] = infofn elif _infojson_written is None: return @@ -2620,91 +2937,79 @@ class YoutubeDL(object): return # Write internet shortcut files - url_link = webloc_link = desktop_link = False - if self.params.get('writelink', False): - if sys.platform == "darwin": # macOS. 
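# A hypothetical, standalone sketch (not part of the upstream patch) of the
# platform dispatch that replaces the url_link/webloc_link/desktop_link flags
# in the surrounding hunk: --write-link maps to .webloc on macOS, .desktop on
# Linux and .url elsewhere (e.g. win32/cygwin). The helper name
# default_link_type is illustrative.
import sys

def default_link_type(platform=sys.platform):
    if platform == 'darwin':
        return 'webloc'
    if platform.startswith('linux'):
        return 'desktop'
    return 'url'

assert default_link_type('win32') == 'url'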
- webloc_link = True - elif sys.platform.startswith("linux"): - desktop_link = True - else: # if sys.platform in ['win32', 'cygwin']: - url_link = True - if self.params.get('writeurllink', False): - url_link = True - if self.params.get('writewebloclink', False): - webloc_link = True - if self.params.get('writedesktoplink', False): - desktop_link = True - - if url_link or webloc_link or desktop_link: - if 'webpage_url' not in info_dict: - self.report_error('Cannot write internet shortcut file because the "webpage_url" field is missing in the media information') - return - ascii_url = iri_to_uri(info_dict['webpage_url']) - - def _write_link_file(extension, template, newline, embed_filename): - linkfn = replace_extension(full_filename, extension, info_dict.get('ext')) + def _write_link_file(link_type): + url = try_get(info_dict['webpage_url'], iri_to_uri) + if not url: + self.report_warning( + f'Cannot write internet shortcut file because the actual URL of "{info_dict["webpage_url"]}" is unknown') + return True + linkfn = replace_extension(self.prepare_filename(info_dict, 'link'), link_type, info_dict.get('ext')) + if not self._ensure_dir_exists(encodeFilename(linkfn)): + return False if self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)): - self.to_screen('[info] Internet shortcut is already present') - else: - try: - self.to_screen('[info] Writing internet shortcut to: ' + linkfn) - with io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', newline=newline) as linkfile: - template_vars = {'url': ascii_url} - if embed_filename: - template_vars['filename'] = linkfn[:-(len(extension) + 1)] - linkfile.write(template % template_vars) - except (OSError, IOError): - self.report_error('Cannot write internet shortcut ' + linkfn) - return False + self.to_screen(f'[info] Internet shortcut (.{link_type}) is already present') + return True + try: + self.to_screen(f'[info] Writing internet shortcut (.{link_type}) to: {linkfn}') + with io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', + newline='\r\n' if link_type == 'url' else '\n') as linkfile: + template_vars = {'url': url} + if link_type == 'desktop': + template_vars['filename'] = linkfn[:-(len(link_type) + 1)] + linkfile.write(LINK_TEMPLATES[link_type] % template_vars) + except (OSError, IOError): + self.report_error(f'Cannot write internet shortcut {linkfn}') + return False return True - if url_link: - if not _write_link_file('url', DOT_URL_LINK_TEMPLATE, '\r\n', embed_filename=False): - return - if webloc_link: - if not _write_link_file('webloc', DOT_WEBLOC_LINK_TEMPLATE, '\n', embed_filename=False): - return - if desktop_link: - if not _write_link_file('desktop', DOT_DESKTOP_LINK_TEMPLATE, '\n', embed_filename=True): + write_links = { + 'url': self.params.get('writeurllink'), + 'webloc': self.params.get('writewebloclink'), + 'desktop': self.params.get('writedesktoplink'), + } + if self.params.get('writelink'): + link_type = ('webloc' if sys.platform == 'darwin' + else 'desktop' if sys.platform.startswith('linux') + else 'url') + write_links[link_type] = True + + if any(should_write and not _write_link_file(link_type) + for link_type, should_write in write_links.items()): + return + + def replace_info_dict(new_info): + nonlocal info_dict + if new_info == info_dict: return + info_dict.clear() + info_dict.update(new_info) try: - info_dict, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move) + new_info, files_to_move = self.pre_process(info_dict, 'before_dl', 
files_to_move) + replace_info_dict(new_info) except PostProcessingError as err: self.report_error('Preprocessing: %s' % str(err)) return - must_record_download_archive = False - if self.params.get('skip_download', False): + if self.params.get('skip_download'): info_dict['filepath'] = temp_filename info_dict['__finaldir'] = os.path.dirname(os.path.abspath(encodeFilename(full_filename))) info_dict['__files_to_move'] = files_to_move - info_dict = self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict) + replace_info_dict(self.run_pp(MoveFilesAfterDownloadPP(self, False), info_dict)) + info_dict['__write_download_archive'] = self.params.get('force_write_download_archive') else: # Download info_dict.setdefault('__postprocessors', []) try: - def existing_file(*filepaths): + def existing_video_file(*filepaths): ext = info_dict.get('ext') - final_ext = self.params.get('final_ext', ext) - existing_files = [] - for file in orderedSet(filepaths): - if final_ext != ext: - converted = replace_extension(file, final_ext, ext) - if os.path.exists(encodeFilename(converted)): - existing_files.append(converted) - if os.path.exists(encodeFilename(file)): - existing_files.append(file) - - if not existing_files or self.params.get('overwrites', False): - for file in orderedSet(existing_files): - self.report_file_delete(file) - os.remove(encodeFilename(file)) - return None - - info_dict['ext'] = os.path.splitext(existing_files[0])[1][1:] - return existing_files[0] + converted = lambda file: replace_extension(file, self.params.get('final_ext') or ext, ext) + file = self.existing_file(itertools.chain(*zip(map(converted, filepaths), filepaths)), + default_overwrite=False) + if file: + info_dict['ext'] = os.path.splitext(file)[1][1:] + return file success = True if info_dict.get('requested_formats') is not None: @@ -2758,30 +3063,39 @@ class YoutubeDL(object): # Ensure filename always has a correct extension for successful merge full_filename = correct_ext(full_filename) temp_filename = correct_ext(temp_filename) - dl_filename = existing_file(full_filename, temp_filename) + dl_filename = existing_video_file(full_filename, temp_filename) info_dict['__real_download'] = False + downloaded = [] + merger = FFmpegMergerPP(self) + + fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-') if dl_filename is not None: self.report_file_already_downloaded(dl_filename) - elif get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-'): + elif fd: + for f in requested_formats if fd != FFmpegFD else []: + f['filepath'] = fname = prepend_extension( + correct_ext(temp_filename, info_dict['ext']), + 'f%s' % f['format_id'], info_dict['ext']) + downloaded.append(fname) info_dict['url'] = '\n'.join(f['url'] for f in requested_formats) success, real_download = self.dl(temp_filename, info_dict) info_dict['__real_download'] = real_download else: - downloaded = [] - merger = FFmpegMergerPP(self) if self.params.get('allow_unplayable_formats'): self.report_warning( 'You have requested merging of multiple formats ' 'while also allowing unplayable formats to be downloaded. ' 'The formats won\'t be merged to prevent data corruption.') elif not merger.available: - self.report_warning( - 'You have requested merging of multiple formats but ffmpeg is not installed. ' - 'The formats won\'t be merged.') + msg = 'You have requested merging of multiple formats but ffmpeg is not installed' + if not self.params.get('ignoreerrors'): + self.report_error(f'{msg}. 
Aborting due to --abort-on-error') + return + self.report_warning(f'{msg}. The formats won\'t be merged') if temp_filename == '-': - reason = ('using a downloader other than ffmpeg' if FFmpegFD.can_merge_formats(info_dict) + reason = ('using a downloader other than ffmpeg' if FFmpegFD.can_merge_formats(info_dict, self.params) else 'but the formats are incompatible for simultaneous download' if merger.available else 'but ffmpeg is not installed') self.report_warning( @@ -2803,17 +3117,18 @@ class YoutubeDL(object): partial_success, real_download = self.dl(fname, new_info) info_dict['__real_download'] = info_dict['__real_download'] or real_download success = success and partial_success - if merger.available and not self.params.get('allow_unplayable_formats'): - info_dict['__postprocessors'].append(merger) - info_dict['__files_to_merge'] = downloaded - # Even if there were no downloads, it is being merged only now - info_dict['__real_download'] = True - else: - for file in downloaded: - files_to_move[file] = None + + if downloaded and merger.available and not self.params.get('allow_unplayable_formats'): + info_dict['__postprocessors'].append(merger) + info_dict['__files_to_merge'] = downloaded + # Even if there were no downloads, it is being merged only now + info_dict['__real_download'] = True + else: + for file in downloaded: + files_to_move[file] = None else: # Just a single file - dl_filename = existing_file(full_filename, temp_filename) + dl_filename = existing_video_file(full_filename, temp_filename) if dl_filename is None or dl_filename == temp_filename: # dl_filename == temp_filename could mean that the file was partially downloaded with --no-part. # So we should try to resume the download @@ -2877,14 +3192,20 @@ class YoutubeDL(object): downloader = get_suitable_downloader(info_dict, self.params) if 'protocol' in info_dict else None downloader = downloader.__name__ if downloader else None - ffmpeg_fixup(info_dict.get('requested_formats') is None and downloader == 'HlsFD', - 'malformed AAC bitstream detected', FFmpegFixupM3u8PP) - ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed timestamps detected', FFmpegFixupTimestampPP) - ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed duration detected', FFmpegFixupDurationPP) + + if info_dict.get('requested_formats') is None: # Not necessary if doing merger + ffmpeg_fixup(downloader == 'HlsFD', + 'Possible MPEG-TS in MP4 container or malformed AAC timestamps', + FFmpegFixupM3u8PP) + ffmpeg_fixup(info_dict.get('is_live') and downloader == 'DashSegmentsFD', + 'Possible duplicate MOOV atoms', FFmpegFixupDuplicateMoovPP) + + ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'Malformed timestamps detected', FFmpegFixupTimestampPP) + ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'Malformed duration detected', FFmpegFixupDurationPP) fixup() try: - info_dict = self.post_process(dl_filename, info_dict, files_to_move) + replace_info_dict(self.post_process(dl_filename, info_dict, files_to_move)) except PostProcessingError as err: self.report_error('Postprocessing: %s' % str(err)) return @@ -2894,16 +3215,41 @@ class YoutubeDL(object): except Exception as err: self.report_error('post hooks: %s' % str(err)) return - must_record_download_archive = True + info_dict['__write_download_archive'] = True + + if self.params.get('force_write_download_archive'): + info_dict['__write_download_archive'] = True + + # Make sure the info_dict was modified in-place + assert info_dict is original_infodict - if must_record_download_archive or 
self.params.get('force_write_download_archive', False): - self.record_download_archive(info_dict) max_downloads = self.params.get('max_downloads') if max_downloads is not None and self._num_downloads >= int(max_downloads): raise MaxDownloadsReached() + def __download_wrapper(self, func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + try: + res = func(*args, **kwargs) + except UnavailableVideoError as e: + self.report_error(e) + except MaxDownloadsReached as e: + self.to_screen(f'[info] {e}') + raise + except DownloadCancelled as e: + self.to_screen(f'[info] {e}') + if not self.params.get('break_per_url'): + raise + else: + if self.params.get('dump_single_json', False): + self.post_extract(res) + self.to_stdout(json.dumps(self.sanitize_info(res))) + return wrapper + def download(self, url_list): """Download a given list of URLs.""" + url_list = variadic(url_list) # Passing a single URL is a common mistake outtmpl = self.outtmpl_dict['default'] if (len(url_list) > 1 and outtmpl != '-' @@ -2912,25 +3258,8 @@ class YoutubeDL(object): raise SameFileError(outtmpl) for url in url_list: - try: - # It also downloads the videos - res = self.extract_info( - url, force_generic_extractor=self.params.get('force_generic_extractor', False)) - except UnavailableVideoError: - self.report_error('unable to download video') - except MaxDownloadsReached: - self.to_screen('[info] Maximum number of downloads reached') - raise - except ExistingVideoReached: - self.to_screen('[info] Encountered a video that is already in the archive, stopping due to --break-on-existing') - raise - except RejectedVideoReached: - self.to_screen('[info] Encountered a video that did not match filter, stopping due to --break-on-reject') - raise - else: - if self.params.get('dump_single_json', False): - self.post_extract(res) - self.to_stdout(json.dumps(self.sanitize_info(res))) + self.__download_wrapper(self.extract_info)( + url, force_generic_extractor=self.params.get('force_generic_extractor', False)) return self._download_retcode @@ -2941,11 +3270,13 @@ class YoutubeDL(object): # FileInput doesn't have a read method, we can't call json.load info = self.sanitize_info(json.loads('\n'.join(f)), self.params.get('clean_infojson', True)) try: - self.process_ie_result(info, download=True) - except (DownloadError, EntryNotInPlaylist, ThrottledDownload): + self.__download_wrapper(self.process_ie_result)(info, download=True) + except (DownloadError, EntryNotInPlaylist, ReExtractInfo) as e: + if not isinstance(e, EntryNotInPlaylist): + self.to_stderr('\r') webpage_url = info.get('webpage_url') if webpage_url is not None: - self.report_warning('The info failed to download, trying with "%s"' % webpage_url) + self.report_warning(f'The info failed to download: {e}; trying with URL {webpage_url}') return self.download([webpage_url]) else: raise @@ -2957,22 +3288,26 @@ class YoutubeDL(object): if info_dict is None: return info_dict info_dict.setdefault('epoch', int(time.time())) - remove_keys = {'__original_infodict'} # Always remove this since this may contain a copy of the entire dict - keep_keys = ['_type'], # Always keep this to facilitate load-info-json + info_dict.setdefault('_type', 'video') + if remove_private_keys: - remove_keys |= { - 'requested_formats', 'requested_subtitles', 'requested_entries', - 'filepath', 'entries', 'original_url', 'playlist_autonumber', + reject = lambda k, v: v is None or (k.startswith('_') and k != '_type') or k in { + 'requested_downloads', 'requested_formats', 'requested_subtitles', 
'requested_entries', + 'entries', 'filepath', 'infojson_filename', 'original_url', 'playlist_autonumber', } - empty_values = (None, {}, [], set(), tuple()) - reject = lambda k, v: k not in keep_keys and ( - k.startswith('_') or k in remove_keys or v in empty_values) else: - reject = lambda k, v: k in remove_keys - filter_fn = lambda obj: ( - list(map(filter_fn, obj)) if isinstance(obj, (LazyList, list, tuple, set)) - else obj if not isinstance(obj, dict) - else dict((k, filter_fn(v)) for k, v in obj.items() if not reject(k, v))) + reject = lambda k, v: False + + def filter_fn(obj): + if isinstance(obj, dict): + return {k: filter_fn(v) for k, v in obj.items() if not reject(k, v)} + elif isinstance(obj, (list, tuple, set, LazyList)): + return list(map(filter_fn, obj)) + elif obj is None or isinstance(obj, (str, int, float, bool)): + return obj + else: + return repr(obj) + return filter_fn(info_dict) @staticmethod @@ -2980,6 +3315,19 @@ class YoutubeDL(object): ''' Alias of sanitize_info for backward compatibility ''' return YoutubeDL.sanitize_info(info_dict, actually_filter) + @staticmethod + def post_extract(info_dict): + def actual_post_extract(info_dict): + if info_dict.get('_type') in ('playlist', 'multi_video'): + for video_dict in info_dict.get('entries', {}): + actual_post_extract(video_dict or {}) + return + + post_extractor = info_dict.pop('__post_extractor', None) or (lambda: {}) + info_dict.update(post_extractor()) + + actual_post_extract(info_dict or {}) + def run_pp(self, pp, infodict): files_to_delete = [] if '__files_to_move' not in infodict: @@ -3009,45 +3357,26 @@ class YoutubeDL(object): del infodict['__files_to_move'][old_filename] return infodict - @staticmethod - def post_extract(info_dict): - def actual_post_extract(info_dict): - if info_dict.get('_type') in ('playlist', 'multi_video'): - for video_dict in info_dict.get('entries', {}): - actual_post_extract(video_dict or {}) - return - - post_extractor = info_dict.get('__post_extractor') or (lambda: {}) - extra = post_extractor().items() - info_dict.update(extra) - info_dict.pop('__post_extractor', None) - - original_infodict = info_dict.get('__original_infodict') or {} - original_infodict.update(extra) - original_infodict.pop('__post_extractor', None) - - actual_post_extract(info_dict or {}) + def run_all_pps(self, key, info, *, additional_pps=None): + self._forceprint(key, info) + for pp in (additional_pps or []) + self._pps[key]: + info = self.run_pp(pp, info) + return info def pre_process(self, ie_info, key='pre_process', files_to_move=None): info = dict(ie_info) info['__files_to_move'] = files_to_move or {} - for pp in self._pps[key]: - info = self.run_pp(pp, info) + info = self.run_all_pps(key, info) return info, info.pop('__files_to_move', None) - def post_process(self, filename, ie_info, files_to_move=None): + def post_process(self, filename, info, files_to_move=None): """Run all the postprocessors on the given file.""" - info = dict(ie_info) info['filepath'] = filename info['__files_to_move'] = files_to_move or {} - - for pp in ie_info.get('__postprocessors', []) + self._pps['post_process']: - info = self.run_pp(pp, info) + info = self.run_all_pps('post_process', info, additional_pps=info.get('__postprocessors')) info = self.run_pp(MoveFilesAfterDownloadPP(self), info) del info['__files_to_move'] - for pp in self._pps['after_move']: - info = self.run_pp(pp, info) - return info + return self.run_all_pps('after_move', info) def _make_archive_id(self, info_dict): video_id = info_dict.get('id') @@ -3086,41 +3415,46 
@@ class YoutubeDL(object): return vid_id = self._make_archive_id(info_dict) assert vid_id + self.write_debug(f'Adding to archive: {vid_id}') with locked_file(fn, 'a', encoding='utf-8') as archive_file: archive_file.write(vid_id + '\n') self.archive.add(vid_id) @staticmethod def format_resolution(format, default='unknown'): - is_images = format.get('vcodec') == 'none' and format.get('acodec') == 'none' if format.get('vcodec') == 'none' and format.get('acodec') != 'none': return 'audio only' if format.get('resolution') is not None: return format['resolution'] if format.get('width') and format.get('height'): - res = '%dx%d' % (format['width'], format['height']) + return '%dx%d' % (format['width'], format['height']) elif format.get('height'): - res = '%sp' % format['height'] + return '%sp' % format['height'] elif format.get('width'): - res = '%dx?' % format['width'] - elif is_images: - return 'images' - else: - return default - return f'{res} images' if is_images else res + return '%dx?' % format['width'] + return default + + def _list_format_headers(self, *headers): + if self.params.get('listformats_table', True) is not False: + return [self._format_screen(header, self.Styles.HEADERS) for header in headers] + return headers def _format_note(self, fdict): res = '' if fdict.get('ext') in ['f4f', 'f4m']: - res += '(unsupported) ' + res += '(unsupported)' if fdict.get('language'): if res: res += ' ' - res += '[%s] ' % fdict['language'] + res += '[%s]' % fdict['language'] if fdict.get('format_note') is not None: - res += fdict['format_note'] + ' ' + if res: + res += ' ' + res += fdict['format_note'] if fdict.get('tbr') is not None: - res += '%4dk ' % fdict['tbr'] + if res: + res += ', ' + res += '%4dk' % fdict['tbr'] if fdict.get('container') is not None: if res: res += ', ' @@ -3165,83 +3499,97 @@ class YoutubeDL(object): res += '~' + format_bytes(fdict['filesize_approx']) return res - def list_formats(self, info_dict): + def render_formats_table(self, info_dict): + if not info_dict.get('formats') and not info_dict.get('url'): + return None + formats = info_dict.get('formats', [info_dict]) - new_format = ( - 'list-formats' not in self.params.get('compat_opts', []) - and self.params.get('listformats_table', True) is not False) - if new_format: + if not self.params.get('listformats_table', True) is not False: table = [ [ format_field(f, 'format_id'), format_field(f, 'ext'), self.format_resolution(f), - format_field(f, 'fps', '%d'), - format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''), - '|', - format_field(f, 'filesize', ' %s', func=format_bytes) + format_field(f, 'filesize_approx', '~%s', func=format_bytes), - format_field(f, 'tbr', '%4dk'), - shorten_protocol_name(f.get('protocol', '').replace("native", "n")), - '|', - format_field(f, 'vcodec', default='unknown').replace('none', ''), - format_field(f, 'vbr', '%4dk'), - format_field(f, 'acodec', default='unknown').replace('none', ''), - format_field(f, 'abr', '%3dk'), - format_field(f, 'asr', '%5dHz'), - ', '.join(filter(None, ( - 'UNSUPPORTED' if f.get('ext') in ('f4f', 'f4m') else '', - format_field(f, 'language', '[%s]'), - format_field(f, 'format_note'), - format_field(f, 'container', ignore=(None, f.get('ext'))), - ))), + self._format_note(f) ] for f in formats if f.get('preference') is None or f['preference'] >= -1000] - header_line = ['ID', 'EXT', 'RESOLUTION', 'FPS', 'HDR', '|', ' FILESIZE', ' TBR', 'PROTO', - '|', 'VCODEC', ' VBR', 'ACODEC', ' ABR', ' ASR', 'MORE INFO'] - else: - table = [ - [ - 
format_field(f, 'format_id'), - format_field(f, 'ext'), - self.format_resolution(f), - self._format_note(f)] - for f in formats - if f.get('preference') is None or f['preference'] >= -1000] - header_line = ['format code', 'extension', 'resolution', 'note'] - - self.to_screen( - '[info] Available formats for %s:' % info_dict['id']) - self.to_stdout(render_table( - header_line, table, delim=new_format, extraGap=(0 if new_format else 1), hideEmpty=new_format)) - - def list_thumbnails(self, info_dict): - thumbnails = list(info_dict.get('thumbnails')) + return render_table(['format code', 'extension', 'resolution', 'note'], table, extra_gap=1) + + delim = self._format_screen('\u2502', self.Styles.DELIM, '|', test_encoding=True) + table = [ + [ + self._format_screen(format_field(f, 'format_id'), self.Styles.ID), + format_field(f, 'ext'), + format_field(f, func=self.format_resolution, ignore=('audio only', 'images')), + format_field(f, 'fps', '\t%d'), + format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''), + delim, + format_field(f, 'filesize', ' \t%s', func=format_bytes) + format_field(f, 'filesize_approx', '~\t%s', func=format_bytes), + format_field(f, 'tbr', '\t%dk'), + shorten_protocol_name(f.get('protocol', '')), + delim, + format_field(f, 'vcodec', default='unknown').replace( + 'none', 'images' if f.get('acodec') == 'none' + else self._format_screen('audio only', self.Styles.SUPPRESS)), + format_field(f, 'vbr', '\t%dk'), + format_field(f, 'acodec', default='unknown').replace( + 'none', '' if f.get('vcodec') == 'none' + else self._format_screen('video only', self.Styles.SUPPRESS)), + format_field(f, 'abr', '\t%dk'), + format_field(f, 'asr', '\t%dHz'), + join_nonempty( + self._format_screen('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None, + format_field(f, 'language', '[%s]'), + join_nonempty(format_field(f, 'format_note'), + format_field(f, 'container', ignore=(None, f.get('ext'))), + delim=', '), + delim=' '), + ] for f in formats if f.get('preference') is None or f['preference'] >= -1000] + header_line = self._list_format_headers( + 'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', delim, '\tFILESIZE', '\tTBR', 'PROTO', + delim, 'VCODEC', '\tVBR', 'ACODEC', '\tABR', '\tASR', 'MORE INFO') + + return render_table( + header_line, table, hide_empty=True, + delim=self._format_screen('\u2500', self.Styles.DELIM, '-', test_encoding=True)) + + def render_thumbnails_table(self, info_dict): + thumbnails = list(info_dict.get('thumbnails') or []) if not thumbnails: - self.to_screen('[info] No thumbnails present for %s' % info_dict['id']) - return - - self.to_screen( - '[info] Thumbnails for %s:' % info_dict['id']) - self.to_stdout(render_table( - ['ID', 'width', 'height', 'URL'], - [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails])) - - def list_subtitles(self, video_id, subtitles, name='subtitles'): - if not subtitles: - self.to_screen('%s has no %s' % (video_id, name)) - return - self.to_screen( - 'Available %s for %s:' % (name, video_id)) + return None + return render_table( + self._list_format_headers('ID', 'Width', 'Height', 'URL'), + [[t.get('id'), t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]) + def render_subtitles_table(self, video_id, subtitles): def _row(lang, formats): exts, names = zip(*((f['ext'], f.get('name') or 'unknown') for f in reversed(formats))) if len(set(names)) == 1: names = [] if names[0] == 'unknown' else names[:1] return [lang, ', '.join(names), 
', '.join(exts)] - self.to_stdout(render_table( - ['Language', 'Name', 'Formats'], + if not subtitles: + return None + return render_table( + self._list_format_headers('Language', 'Name', 'Formats'), [_row(lang, formats) for lang, formats in subtitles.items()], - hideEmpty=True)) + hide_empty=True) + + def __list_table(self, video_id, name, func, *args): + table = func(*args) + if not table: + self.to_screen(f'{video_id} has no {name}') + return + self.to_screen(f'[info] Available {name} for {video_id}:') + self.to_stdout(table) + + def list_formats(self, info_dict): + self.__list_table(info_dict['id'], 'formats', self.render_formats_table, info_dict) + + def list_thumbnails(self, info_dict): + self.__list_table(info_dict['id'], 'thumbnails', self.render_thumbnails_table, info_dict) + + def list_subtitles(self, video_id, subtitles, name='subtitles'): + self.__list_table(video_id, name, self.render_subtitles_table, video_id, subtitles) def urlopen(self, req): """ Start an HTTP download """ @@ -3252,45 +3600,61 @@ class YoutubeDL(object): def print_debug_header(self): if not self.params.get('verbose'): return - get_encoding = lambda stream: getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__) - encoding_str = ( - '[debug] Encodings: locale %s, fs %s, stdout %s, stderr %s, pref %s\n' % ( - locale.getpreferredencoding(), - sys.getfilesystemencoding(), - get_encoding(self._screen_file), get_encoding(self._err_file), - self.get_encoding())) + + def get_encoding(stream): + ret = str(getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__)) + if not supports_terminal_sequences(stream): + from .compat import WINDOWS_VT_MODE + ret += ' (No VT)' if WINDOWS_VT_MODE is False else ' (No ANSI)' + return ret + + encoding_str = 'Encodings: locale %s, fs %s, out %s, err %s, pref %s' % ( + locale.getpreferredencoding(), + sys.getfilesystemencoding(), + get_encoding(self._out_files['screen']), get_encoding(self._out_files['error']), + self.get_encoding()) logger = self.params.get('logger') if logger: write_debug = lambda msg: logger.debug(f'[debug] {msg}') write_debug(encoding_str) else: - write_debug = lambda msg: self._write_string(f'[debug] {msg}') - write_string(encoding_str, encoding=None) - - write_debug('hypervideo version %s%s\n' % (__version__, '' if source == 'unknown' else f' ({source})')) - if _LAZY_LOADER: - write_debug('Lazy loading extractors enabled\n') + write_string(f'[debug] {encoding_str}\n', encoding=None) + write_debug = lambda msg: self._write_string(f'[debug] {msg}\n') + + source = detect_variant() + write_debug(join_nonempty( + 'hypervideo version', __version__, + f'[{RELEASE_GIT_HEAD}]' if RELEASE_GIT_HEAD else '', + '' if source == 'unknown' else f'({source})', + delim=' ')) + if not _LAZY_LOADER: + if os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): + write_debug('Lazy loading extractors is forcibly disabled') + else: + write_debug('Lazy loading extractors is disabled') if plugin_extractors or plugin_postprocessors: - write_debug('Plugins: %s\n' % [ + write_debug('Plugins: %s' % [ '%s%s' % (klass.__name__, '' if klass.__name__ == name else f' as {name}') for name, klass in itertools.chain(plugin_extractors.items(), plugin_postprocessors.items())]) if self.params.get('compat_opts'): - write_debug('Compatibility options: %s\n' % ', '.join(self.params.get('compat_opts'))) - try: - sp = subprocess.Popen( - ['git', 'rev-parse', '--short', 'HEAD'], - stdout=subprocess.PIPE, stderr=subprocess.PIPE, - cwd=os.path.dirname(os.path.abspath(__file__))) - out, err = 
process_communicate_or_kill(sp) - out = out.decode().strip() - if re.match('[0-9a-f]+', out): - write_debug('Git HEAD: %s\n' % out) - except Exception: + write_debug('Compatibility options: %s' % ', '.join(self.params.get('compat_opts'))) + + if source == 'source': try: - sys.exc_clear() + sp = Popen( + ['git', 'rev-parse', '--short', 'HEAD'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + cwd=os.path.dirname(os.path.abspath(__file__))) + out, err = sp.communicate_or_kill() + out = out.decode().strip() + if re.match('[0-9a-f]+', out): + write_debug('Git HEAD: %s' % out) except Exception: - pass + try: + sys.exc_clear() + except Exception: + pass def python_implementation(): impl_name = platform.python_implementation() @@ -3298,46 +3662,49 @@ class YoutubeDL(object): return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3] return impl_name - write_debug('Python version %s (%s %s) - %s\n' % ( + write_debug('Python version %s (%s %s) - %s' % ( platform.python_version(), python_implementation(), platform.architecture()[0], platform_name())) - exe_versions = FFmpegPostProcessor.get_versions(self) + exe_versions, ffmpeg_features = FFmpegPostProcessor.get_versions_and_features(self) + ffmpeg_features = {key for key, val in ffmpeg_features.items() if val} + if ffmpeg_features: + exe_versions['ffmpeg'] += ' (%s)' % ','.join(ffmpeg_features) + exe_versions['rtmpdump'] = rtmpdump_version() exe_versions['phantomjs'] = PhantomJSwrapper._version() exe_str = ', '.join( f'{exe} {v}' for exe, v in sorted(exe_versions.items()) if v ) or 'none' - write_debug('exe versions: %s\n' % exe_str) + write_debug('exe versions: %s' % exe_str) from .downloader.websocket import has_websockets from .postprocessor.embedthumbnail import has_mutagen - from .cookies import SQLITE_AVAILABLE, KEYRING_AVAILABLE + from .cookies import SQLITE_AVAILABLE, SECRETSTORAGE_AVAILABLE - lib_str = ', '.join(sorted(filter(None, ( + lib_str = join_nonempty( + compat_brotli and compat_brotli.__name__, + has_certifi and 'certifi', compat_pycrypto_AES and compat_pycrypto_AES.__name__.split('.')[0], - has_websockets and 'websockets', + SECRETSTORAGE_AVAILABLE and 'secretstorage', has_mutagen and 'mutagen', SQLITE_AVAILABLE and 'sqlite', - KEYRING_AVAILABLE and 'keyring', - )))) or 'none' - write_debug('Optional libraries: %s\n' % lib_str) - write_debug('ANSI escape support: stdout = %s, stderr = %s\n' % ( - supports_terminal_sequences(self._screen_file), - supports_terminal_sequences(self._err_file))) + has_websockets and 'websockets', + delim=', ') or 'none' + write_debug('Optional libraries: %s' % lib_str) proxy_map = {} for handler in self._opener.handlers: if hasattr(handler, 'proxies'): proxy_map.update(handler.proxies) - write_debug('Proxy map: ' + compat_str(proxy_map) + '\n') + write_debug(f'Proxy map: {proxy_map}') - if self.params.get('call_home', False): + # Not implemented + if False and self.params.get('call_home'): ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8') - write_debug('Public IP address: %s\n' % ipaddr) - return + write_debug('Public IP address: %s' % ipaddr) latest_version = self.urlopen( 'https://yt-dl.org/latest/version').read().decode('utf-8') if version_tuple(latest_version) > version_tuple(__version__): @@ -3410,8 +3777,10 @@ class YoutubeDL(object): encoding = preferredencoding() return encoding - def _write_info_json(self, label, ie_result, infofn): - ''' Write infojson and returns True = written, False = skip, None = error ''' + def _write_info_json(self, label, ie_result, 
infofn, overwrite=None): + ''' Write infojson and returns True = written, 'exists' = Already exists, False = skip, None = error ''' + if overwrite is None: + overwrite = self.params.get('overwrites', True) if not self.params.get('writeinfojson'): return False elif not infofn: @@ -3419,16 +3788,17 @@ class YoutubeDL(object): return False elif not self._ensure_dir_exists(infofn): return None - elif not self.params.get('overwrites', True) and os.path.exists(infofn): + elif not overwrite and os.path.exists(infofn): self.to_screen(f'[info] {label.title()} metadata is already present') - else: - self.to_screen(f'[info] Writing {label} metadata as JSON to: {infofn}') - try: - write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn) - except (OSError, IOError): - self.report_error(f'Cannot write {label} metadata to JSON file {infofn}') - return None - return True + return 'exists' + + self.to_screen(f'[info] Writing {label} metadata as JSON to: {infofn}') + try: + write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn) + return True + except (OSError, IOError): + self.report_error(f'Cannot write {label} metadata to JSON file {infofn}') + return None def _write_description(self, label, ie_result, descfn): ''' Write description and returns True = written, False = skip, None = error ''' @@ -3471,10 +3841,11 @@ class YoutubeDL(object): sub_format = sub_info['ext'] sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext')) sub_filename_final = subtitles_filename(sub_filename_base, sub_lang, sub_format, info_dict.get('ext')) - if not self.params.get('overwrites', True) and os.path.exists(sub_filename): + existing_sub = self.existing_file((sub_filename_final, sub_filename)) + if existing_sub: self.to_screen(f'[info] Video subtitle {sub_lang}.{sub_format} is already present') - sub_info['filepath'] = sub_filename - ret.append((sub_filename, sub_filename_final)) + sub_info['filepath'] = existing_sub + ret.append((existing_sub, sub_filename_final)) continue self.to_screen(f'[info] Writing video subtitles to: {sub_filename}') @@ -3497,9 +3868,13 @@ class YoutubeDL(object): self.dl(sub_filename, sub_copy, subtitle=True) sub_info['filepath'] = sub_filename ret.append((sub_filename, sub_filename_final)) - except (ExtractorError, IOError, OSError, ValueError) + network_exceptions as err: - self.report_warning(f'Unable to download video subtitles for {sub_lang!r}: {err}') - continue + except (DownloadError, ExtractorError, IOError, OSError, ValueError) + network_exceptions as err: + msg = f'Unable to download video subtitles for {sub_lang!r}: {err}' + if self.params.get('ignoreerrors') is not True: # False or 'only_download' + if not self.params.get('ignoreerrors'): + self.report_error(msg) + raise DownloadError(msg) + self.report_warning(msg) return ret def _write_thumbnails(self, label, info_dict, filename, thumb_filename_base=None): @@ -3516,26 +3891,29 @@ class YoutubeDL(object): self.write_debug(f'Skipping writing {label} thumbnail') return ret - for t in thumbnails[::-1]: + for idx, t in list(enumerate(thumbnails))[::-1]: thumb_ext = (f'{t["id"]}.' 
if multiple else '') + determine_ext(t['url'], 'jpg') - thumb_display_id = f'{label} thumbnail' + (f' {t["id"]}' if multiple else '') + thumb_display_id = f'{label} thumbnail {t["id"]}' thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext')) thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext')) - if not self.params.get('overwrites', True) and os.path.exists(thumb_filename): - ret.append((thumb_filename, thumb_filename_final)) - t['filepath'] = thumb_filename - self.to_screen(f'[info] {thumb_display_id.title()} is already present') + existing_thumb = self.existing_file((thumb_filename_final, thumb_filename)) + if existing_thumb: + self.to_screen('[info] %s is already present' % ( + thumb_display_id if multiple else f'{label} thumbnail').capitalize()) + t['filepath'] = existing_thumb + ret.append((existing_thumb, thumb_filename_final)) else: self.to_screen(f'[info] Downloading {thumb_display_id} ...') try: - uf = self.urlopen(t['url']) + uf = self.urlopen(sanitized_Request(t['url'], headers=t.get('http_headers', {}))) self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}') with open(encodeFilename(thumb_filename), 'wb') as thumbf: shutil.copyfileobj(uf, thumbf) ret.append((thumb_filename, thumb_filename_final)) t['filepath'] = thumb_filename except network_exceptions as err: + thumbnails.pop(idx) self.report_warning(f'Unable to download {thumb_display_id}: {err}') if ret and not write_all: break diff --git a/hypervideo_dl/__init__.py b/hypervideo_dl/__init__.py index d8b7de5..dc53a9e 100644 --- a/hypervideo_dl/__init__.py +++ b/hypervideo_dl/__init__.py @@ -11,32 +11,33 @@ import random import re import sys -from .options import ( - parseOpts, -) +from .options import parseOpts from .compat import ( compat_getpass, + compat_os_name, compat_shlex_quote, workaround_optparse_bug9161, ) -from .cookies import SUPPORTED_BROWSERS +from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS from .utils import ( DateRange, decodeOption, + DownloadCancelled, DownloadError, - error_to_compat_str, - ExistingVideoReached, expand_path, + float_or_none, + GeoUtils, + int_or_none, match_filter_func, - MaxDownloadsReached, + NO_DEFAULT, parse_duration, preferredencoding, read_batch_urls, - RejectedVideoReached, render_table, SameFileError, setproctitle, std_headers, + traverse_obj, write_string, ) from .downloader import ( @@ -57,215 +58,68 @@ from .postprocessor import ( from .YoutubeDL import YoutubeDL -def _real_main(argv=None): - # Compatibility fixes for Windows - if sys.platform == 'win32': - # https://github.com/ytdl-org/youtube-dl/issues/820 - codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None) - - workaround_optparse_bug9161() - - setproctitle('hypervideo') - - parser, opts, args = parseOpts(argv) - warnings = [] - - # Set user agent - if opts.user_agent is not None: - std_headers['User-Agent'] = opts.user_agent - - # Set referer - if opts.referer is not None: - std_headers['Referer'] = opts.referer - - # Custom HTTP headers - std_headers.update(opts.headers) - - # Dump user agent - if opts.dump_user_agent: - write_string(std_headers['User-Agent'] + '\n', out=sys.stdout) - sys.exit(0) - +def get_urls(urls, batchfile, verbose): # Batch file verification batch_urls = [] - if opts.batchfile is not None: + if batchfile is not None: try: - if opts.batchfile == '-': + if batchfile == '-': + write_string('Reading URLs from stdin - EOF (%s) to end:\n' % ( + 'Ctrl+Z' if compat_os_name == 'nt' else 
'Ctrl+D')) batchfd = sys.stdin else: batchfd = io.open( - expand_path(opts.batchfile), + expand_path(batchfile), 'r', encoding='utf-8', errors='ignore') batch_urls = read_batch_urls(batchfd) - if opts.verbose: + if verbose: write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n') except IOError: - sys.exit('ERROR: batch file %s could not be read' % opts.batchfile) - all_urls = batch_urls + [url.strip() for url in args] # batch_urls are already striped in read_batch_urls + sys.exit('ERROR: batch file %s could not be read' % batchfile) _enc = preferredencoding() - all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls] + return [ + url.strip().decode(_enc, 'ignore') if isinstance(url, bytes) else url.strip() + for url in batch_urls + urls] + +def print_extractor_information(opts, urls): if opts.list_extractors: for ie in list_extractors(opts.age_limit): write_string(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie.working() else '') + '\n', out=sys.stdout) - matchedUrls = [url for url in all_urls if ie.suitable(url)] + matchedUrls = [url for url in urls if ie.suitable(url)] for mu in matchedUrls: write_string(' ' + mu + '\n', out=sys.stdout) - sys.exit(0) - if opts.list_extractor_descriptions: + elif opts.list_extractor_descriptions: for ie in list_extractors(opts.age_limit): if not ie.working(): continue - desc = getattr(ie, 'IE_DESC', ie.IE_NAME) - if desc is False: + if ie.IE_DESC is False: continue - if hasattr(ie, 'SEARCH_KEY'): + desc = ie.IE_DESC or ie.IE_NAME + if getattr(ie, 'SEARCH_KEY', None) is not None: _SEARCHES = ('cute kittens', 'slithering pythons', 'falling cat', 'angry poodle', 'purple fish', 'running tortoise', 'sleeping bunny', 'burping cow') _COUNTS = ('', '5', '10', 'all') - desc += ' (Example: "%s%s:%s" )' % (ie.SEARCH_KEY, random.choice(_COUNTS), random.choice(_SEARCHES)) + desc += f'; "{ie.SEARCH_KEY}:" prefix (Example: "{ie.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(_SEARCHES)}")' write_string(desc + '\n', out=sys.stdout) - sys.exit(0) - if opts.ap_list_mso: + elif opts.ap_list_mso: table = [[mso_id, mso_info['name']] for mso_id, mso_info in MSO_INFO.items()] write_string('Supported TV Providers:\n' + render_table(['mso', 'mso name'], table) + '\n', out=sys.stdout) - sys.exit(0) - - # Conflicting, missing and erroneous options - if opts.usenetrc and (opts.username is not None or opts.password is not None): - parser.error('using .netrc conflicts with giving username/password') - if opts.password is not None and opts.username is None: - parser.error('account username missing\n') - if opts.ap_password is not None and opts.ap_username is None: - parser.error('TV Provider account username missing\n') - if opts.autonumber_size is not None: - if opts.autonumber_size <= 0: - parser.error('auto number size must be positive') - if opts.autonumber_start is not None: - if opts.autonumber_start < 0: - parser.error('auto number start must be positive or 0') - if opts.username is not None and opts.password is None: - opts.password = compat_getpass('Type account password and press [Return]: ') - if opts.ap_username is not None and opts.ap_password is None: - opts.ap_password = compat_getpass('Type TV provider account password and press [Return]: ') - if opts.ratelimit is not None: - numeric_limit = FileDownloader.parse_bytes(opts.ratelimit) - if numeric_limit is None: - parser.error('invalid rate limit specified') - opts.ratelimit = numeric_limit - if opts.throttledratelimit is not None: - numeric_limit = 
FileDownloader.parse_bytes(opts.throttledratelimit) - if numeric_limit is None: - parser.error('invalid rate limit specified') - opts.throttledratelimit = numeric_limit - if opts.min_filesize is not None: - numeric_limit = FileDownloader.parse_bytes(opts.min_filesize) - if numeric_limit is None: - parser.error('invalid min_filesize specified') - opts.min_filesize = numeric_limit - if opts.max_filesize is not None: - numeric_limit = FileDownloader.parse_bytes(opts.max_filesize) - if numeric_limit is None: - parser.error('invalid max_filesize specified') - opts.max_filesize = numeric_limit - if opts.sleep_interval is not None: - if opts.sleep_interval < 0: - parser.error('sleep interval must be positive or 0') - if opts.max_sleep_interval is not None: - if opts.max_sleep_interval < 0: - parser.error('max sleep interval must be positive or 0') - if opts.sleep_interval is None: - parser.error('min sleep interval must be specified, use --min-sleep-interval') - if opts.max_sleep_interval < opts.sleep_interval: - parser.error('max sleep interval must be greater than or equal to min sleep interval') else: - opts.max_sleep_interval = opts.sleep_interval - if opts.sleep_interval_subtitles is not None: - if opts.sleep_interval_subtitles < 0: - parser.error('subtitles sleep interval must be positive or 0') - if opts.sleep_interval_requests is not None: - if opts.sleep_interval_requests < 0: - parser.error('requests sleep interval must be positive or 0') - if opts.ap_mso and opts.ap_mso not in MSO_INFO: - parser.error('Unsupported TV Provider, use --ap-list-mso to get a list of supported TV Providers') - if opts.overwrites: # --yes-overwrites implies --no-continue - opts.continue_dl = False - if opts.concurrent_fragment_downloads <= 0: - raise ValueError('Concurrent fragments must be positive') + return False + return True - def parse_retries(retries, name=''): - if retries in ('inf', 'infinite'): - parsed_retries = float('inf') - else: - try: - parsed_retries = int(retries) - except (TypeError, ValueError): - parser.error('invalid %sretry count specified' % name) - return parsed_retries - if opts.retries is not None: - opts.retries = parse_retries(opts.retries) - if opts.fragment_retries is not None: - opts.fragment_retries = parse_retries(opts.fragment_retries, 'fragment ') - if opts.extractor_retries is not None: - opts.extractor_retries = parse_retries(opts.extractor_retries, 'extractor ') - if opts.buffersize is not None: - numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize) - if numeric_buffersize is None: - parser.error('invalid buffer size specified') - opts.buffersize = numeric_buffersize - if opts.http_chunk_size is not None: - numeric_chunksize = FileDownloader.parse_bytes(opts.http_chunk_size) - if not numeric_chunksize: - parser.error('invalid http chunk size specified') - opts.http_chunk_size = numeric_chunksize - if opts.playliststart <= 0: - raise ValueError('Playlist start must be positive') - if opts.playlistend not in (-1, None) and opts.playlistend < opts.playliststart: - raise ValueError('Playlist end must be greater than playlist start') - if opts.extractaudio: - if opts.audioformat not in ['best'] + list(FFmpegExtractAudioPP.SUPPORTED_EXTS): - parser.error('invalid audio format specified') - if opts.audioquality: - opts.audioquality = opts.audioquality.strip('k').strip('K') - if not opts.audioquality.isdigit(): - parser.error('invalid audio quality specified') - if opts.recodevideo is not None: - opts.recodevideo = opts.recodevideo.replace(' ', '') - if not 
re.match(FFmpegVideoConvertorPP.FORMAT_RE, opts.recodevideo): - parser.error('invalid video remux format specified') - if opts.remuxvideo is not None: - opts.remuxvideo = opts.remuxvideo.replace(' ', '') - if not re.match(FFmpegVideoRemuxerPP.FORMAT_RE, opts.remuxvideo): - parser.error('invalid video remux format specified') - if opts.convertsubtitles is not None: - if opts.convertsubtitles not in FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS: - parser.error('invalid subtitle format specified') - if opts.convertthumbnails is not None: - if opts.convertthumbnails not in FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS: - parser.error('invalid thumbnail format specified') - - if opts.cookiesfrombrowser is not None: - opts.cookiesfrombrowser = [ - part.strip() or None for part in opts.cookiesfrombrowser.split(':', 1)] - if opts.cookiesfrombrowser[0].lower() not in SUPPORTED_BROWSERS: - parser.error('unsupported browser specified for cookies') - - if opts.date is not None: - date = DateRange.day(opts.date) - else: - date = DateRange(opts.dateafter, opts.datebefore) - - compat_opts = opts.compat_opts +def set_compat_opts(opts): def _unused_compat_opt(name): - if name not in compat_opts: + if name not in opts.compat_opts: return False - compat_opts.discard(name) - compat_opts.update(['*%s' % name]) + opts.compat_opts.discard(name) + opts.compat_opts.update(['*%s' % name]) return True def set_default_compat(compat_name, opt_name, default=True, remove_compat=True): attr = getattr(opts, opt_name) - if compat_name in compat_opts: + if compat_name in opts.compat_opts: if attr is None: setattr(opts, opt_name, not default) return True @@ -280,54 +134,204 @@ def _real_main(argv=None): set_default_compat('abort-on-error', 'ignoreerrors', 'only_download') set_default_compat('no-playlist-metafiles', 'allow_playlist_files') set_default_compat('no-clean-infojson', 'clean_infojson') - if 'format-sort' in compat_opts: + if 'no-attach-info-json' in opts.compat_opts: + if opts.embed_infojson: + _unused_compat_opt('no-attach-info-json') + else: + opts.embed_infojson = False + if 'format-sort' in opts.compat_opts: opts.format_sort.extend(InfoExtractor.FormatSort.ytdl_default) _video_multistreams_set = set_default_compat('multistreams', 'allow_multiple_video_streams', False, remove_compat=False) _audio_multistreams_set = set_default_compat('multistreams', 'allow_multiple_audio_streams', False, remove_compat=False) if _video_multistreams_set is False and _audio_multistreams_set is False: _unused_compat_opt('multistreams') - outtmpl_default = opts.outtmpl.get('default') - if 'filename' in compat_opts: - if outtmpl_default is None: - outtmpl_default = '%(title)s-%(id)s.%(ext)s' - opts.outtmpl.update({'default': outtmpl_default}) + if 'filename' in opts.compat_opts: + if opts.outtmpl.get('default') is None: + opts.outtmpl.update({'default': '%(title)s-%(id)s.%(ext)s'}) else: _unused_compat_opt('filename') + +def validate_options(opts): + def validate(cndn, name, value=None, msg=None): + if cndn: + return True + raise ValueError((msg or 'invalid {name} "{value}" given').format(name=name, value=value)) + + def validate_in(name, value, items, msg=None): + return validate(value is None or value in items, name, value, msg) + + def validate_regex(name, value, regex): + return validate(value is None or re.match(regex, value), name, value) + + def validate_positive(name, value, strict=False): + return validate(value is None or value > 0 or (not strict and value == 0), + name, value, '{name} "{value}" must be positive' + ('' if strict 
else ' or 0')) + + def validate_minmax(min_val, max_val, min_name, max_name=None): + if max_val is None or min_val is None or max_val >= min_val: + return + if not max_name: + min_name, max_name = f'min {min_name}', f'max {min_name}' + raise ValueError(f'{max_name} "{max_val}" must be greater than or equal to {min_name} "{min_val}"') + + # Usernames and passwords + validate(not opts.usenetrc or (opts.username is None and opts.password is None), + '.netrc', msg='using {name} conflicts with giving username/password') + validate(opts.password is None or opts.username is not None, 'account username', msg='{name} missing') + validate(opts.ap_password is None or opts.ap_username is not None, + 'TV Provider account username', msg='{name} missing') + validate_in('TV Provider', opts.ap_mso, MSO_INFO, + 'Unsupported {name} "{value}", use --ap-list-mso to get a list of supported TV Providers') + + # Numbers + validate_positive('autonumber start', opts.autonumber_start) + validate_positive('autonumber size', opts.autonumber_size, True) + validate_positive('concurrent fragments', opts.concurrent_fragment_downloads, True) + validate_positive('playlist start', opts.playliststart, True) + if opts.playlistend != -1: + validate_minmax(opts.playliststart, opts.playlistend, 'playlist start', 'playlist end') + + # Time ranges + validate_positive('subtitles sleep interval', opts.sleep_interval_subtitles) + validate_positive('requests sleep interval', opts.sleep_interval_requests) + validate_positive('sleep interval', opts.sleep_interval) + validate_positive('max sleep interval', opts.max_sleep_interval) + if opts.sleep_interval is None: + validate( + opts.max_sleep_interval is None, 'min sleep interval', + msg='{name} must be specified; use --min-sleep-interval') + elif opts.max_sleep_interval is None: + opts.max_sleep_interval = opts.sleep_interval + else: + validate_minmax(opts.sleep_interval, opts.max_sleep_interval, 'sleep interval') + + if opts.wait_for_video is not None: + min_wait, max_wait, *_ = map(parse_duration, opts.wait_for_video.split('-', 1) + [None]) + validate(min_wait is not None and not (max_wait is None and '-' in opts.wait_for_video), + 'time range to wait for video', opts.wait_for_video) + validate_minmax(min_wait, max_wait, 'time range to wait for video') + opts.wait_for_video = (min_wait, max_wait) + + # Format sort + for f in opts.format_sort: + validate_regex('format sorting', f, InfoExtractor.FormatSort.regex) + + # Postprocessor formats + validate_in('audio format', opts.audioformat, ['best'] + list(FFmpegExtractAudioPP.SUPPORTED_EXTS)) + validate_in('subtitle format', opts.convertsubtitles, FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS) + validate_in('thumbnail format', opts.convertthumbnails, FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS) + if opts.recodevideo is not None: + opts.recodevideo = opts.recodevideo.replace(' ', '') + validate_regex('video recode format', opts.recodevideo, FFmpegVideoConvertorPP.FORMAT_RE) + if opts.remuxvideo is not None: + opts.remuxvideo = opts.remuxvideo.replace(' ', '') + validate_regex('video remux format', opts.remuxvideo, FFmpegVideoRemuxerPP.FORMAT_RE) + if opts.audioquality: + opts.audioquality = opts.audioquality.strip('k').strip('K') + # int_or_none prevents inf, nan + validate_positive('audio quality', int_or_none(float_or_none(opts.audioquality), default=0)) + + # Retries + def parse_retries(name, value): + if value is None: + return None + elif value in ('inf', 'infinite'): + return float('inf') + try: + return int(value) + except 
(TypeError, ValueError): + validate(False, f'{name} retry count', value) + + opts.retries = parse_retries('download', opts.retries) + opts.fragment_retries = parse_retries('fragment', opts.fragment_retries) + opts.extractor_retries = parse_retries('extractor', opts.extractor_retries) + opts.file_access_retries = parse_retries('file access', opts.file_access_retries) + + # Bytes + def parse_bytes(name, value): + if value is None: + return None + numeric_limit = FileDownloader.parse_bytes(value) + validate(numeric_limit is not None, name, value) + return numeric_limit + + opts.ratelimit = parse_bytes('rate limit', opts.ratelimit) + opts.throttledratelimit = parse_bytes('throttled rate limit', opts.throttledratelimit) + opts.min_filesize = parse_bytes('min filesize', opts.min_filesize) + opts.max_filesize = parse_bytes('max filesize', opts.max_filesize) + opts.buffersize = parse_bytes('buffer size', opts.buffersize) + opts.http_chunk_size = parse_bytes('http chunk size', opts.http_chunk_size) + + # Output templates def validate_outtmpl(tmpl, msg): err = YoutubeDL.validate_outtmpl(tmpl) if err: - parser.error('invalid %s %r: %s' % (msg, tmpl, error_to_compat_str(err))) + raise ValueError(f'invalid {msg} "{tmpl}": {err}') for k, tmpl in opts.outtmpl.items(): validate_outtmpl(tmpl, f'{k} output template') - opts.forceprint = opts.forceprint or [] - for tmpl in opts.forceprint or []: - validate_outtmpl(tmpl, 'print template') + for type_, tmpl_list in opts.forceprint.items(): + for tmpl in tmpl_list: + validate_outtmpl(tmpl, f'{type_} print template') + for type_, tmpl_list in opts.print_to_file.items(): + for tmpl, file in tmpl_list: + validate_outtmpl(tmpl, f'{type_} print to file template') + validate_outtmpl(file, f'{type_} print to file filename') validate_outtmpl(opts.sponsorblock_chapter_title, 'SponsorBlock chapter title') for k, tmpl in opts.progress_template.items(): k = f'{k[:-6]} console title' if '-title' in k else f'{k} progress' validate_outtmpl(tmpl, f'{k} template') - if opts.extractaudio and not opts.keepvideo and opts.format is None: - opts.format = 'bestaudio/best' - - if outtmpl_default is not None and not os.path.splitext(outtmpl_default)[1] and opts.extractaudio: - parser.error('Cannot download a video and extract audio into the same' - ' file! Use "{0}.%(ext)s" instead of "{0}" as the output' - ' template'.format(outtmpl_default)) - - for f in opts.format_sort: - if re.match(InfoExtractor.FormatSort.regex, f) is None: - parser.error('invalid format sort string "%s" specified' % f) - + outtmpl_default = opts.outtmpl.get('default') + if outtmpl_default == '': + opts.skip_download = None + del opts.outtmpl['default'] + if outtmpl_default and not os.path.splitext(outtmpl_default)[1] and opts.extractaudio: + raise ValueError( + 'Cannot download a video and extract audio into the same file! ' + f'Use "{outtmpl_default}.%(ext)s" instead of "{outtmpl_default}" as the output template') + + # Remove chapters + remove_chapters_patterns, opts.remove_ranges = [], [] + for regex in opts.remove_chapters or []: + if regex.startswith('*'): + dur = list(map(parse_duration, regex[1:].split('-'))) + if len(dur) == 2 and all(t is not None for t in dur): + opts.remove_ranges.append(tuple(dur)) + continue + raise ValueError(f'invalid --remove-chapters time range "{regex}". 
Must be of the form *start-end') + try: + remove_chapters_patterns.append(re.compile(regex)) + except re.error as err: + raise ValueError(f'invalid --remove-chapters regex "{regex}" - {err}') + opts.remove_chapters = remove_chapters_patterns + + # Cookies from browser + if opts.cookiesfrombrowser: + mobj = re.match(r'(?P<name>[^+:]+)(\s*\+\s*(?P<keyring>[^:]+))?(\s*:(?P<profile>.+))?', opts.cookiesfrombrowser) + if mobj is None: + raise ValueError(f'invalid cookies from browser arguments: {opts.cookiesfrombrowser}') + browser_name, keyring, profile = mobj.group('name', 'keyring', 'profile') + browser_name = browser_name.lower() + if browser_name not in SUPPORTED_BROWSERS: + raise ValueError(f'unsupported browser specified for cookies: "{browser_name}". ' + f'Supported browsers are: {", ".join(sorted(SUPPORTED_BROWSERS))}') + if keyring is not None: + keyring = keyring.upper() + if keyring not in SUPPORTED_KEYRINGS: + raise ValueError(f'unsupported keyring specified for cookies: "{keyring}". ' + f'Supported keyrings are: {", ".join(sorted(SUPPORTED_KEYRINGS))}') + opts.cookiesfrombrowser = (browser_name, profile, keyring) + + # MetadataParser def metadataparser_actions(f): if isinstance(f, str): cmd = '--parse-metadata %s' % compat_shlex_quote(f) try: actions = [MetadataFromFieldPP.to_action(f)] except Exception as err: - parser.error(f'{cmd} is invalid; {err}') + raise ValueError(f'{cmd} is invalid; {err}') else: cmd = '--replace-in-metadata %s' % ' '.join(map(compat_shlex_quote, f)) actions = ((MetadataParserPP.Actions.REPLACE, x, *f[1:]) for x in f[0].split(',')) @@ -336,242 +340,296 @@ def _real_main(argv=None): try: MetadataParserPP.validate_action(*action) except Exception as err: - parser.error(f'{cmd} is invalid; {err}') + raise ValueError(f'{cmd} is invalid; {err}') yield action
sponsorblock_query = opts.sponsorblock_mark | opts.sponsorblock_remove + opts.sponsorblock_mark = opts.sponsorblock_remove = set() + + warnings, deprecation_warnings = [], [] + + # Common mistake: -f best + if opts.format == 'best': + warnings.append('.\n '.join(( + '"-f best" selects the best pre-merged format which is often not the best option', + 'To let hypervideo download and merge the best available formats, simply do not pass any format selection', + 'If you know what you are doing and want only the best pre-merged format, use "-f b" instead to suppress this warning'))) + + # --(postprocessor/downloader)-args without name + def report_args_compat(name, value, key1, key2=None): + if key1 in value and key2 not in value: + warnings.append(f'{name} arguments given without specifying name. The arguments will be given to all {name}s') + return True + return False + + report_args_compat('external downloader', opts.external_downloader_args, 'default') + if report_args_compat('post-processor', opts.postprocessor_args, 'default-compat', 'default'): + opts.postprocessor_args['default'] = opts.postprocessor_args.pop('default-compat') + opts.postprocessor_args.setdefault('sponskrub', []) + + def report_conflict(arg1, opt1, arg2='--allow-unplayable-formats', opt2='allow_unplayable_formats', + val1=NO_DEFAULT, val2=NO_DEFAULT, default=False): + if val2 is NO_DEFAULT: + val2 = getattr(opts, opt2) + if not val2: + return + + if val1 is NO_DEFAULT: + val1 = getattr(opts, opt1) + if val1: + warnings.append(f'{arg1} is ignored since {arg2} was given') + setattr(opts, opt1, default) + + # Conflicting options + report_conflict('--dateafter', 'dateafter', '--date', 'date', default=None) + report_conflict('--datebefore', 'datebefore', '--date', 'date', default=None) + report_conflict('--exec-before-download', 'exec_before_dl_cmd', '"--exec before_dl:"', 'exec_cmd', opts.exec_cmd.get('before_dl')) + report_conflict('--id', 'useid', '--output', 'outtmpl', val2=opts.outtmpl.get('default')) + report_conflict('--remux-video', 'remuxvideo', '--recode-video', 'recodevideo') + report_conflict('--sponskrub', 'sponskrub', '--remove-chapters', 'remove_chapters') + report_conflict('--sponskrub', 'sponskrub', '--sponsorblock-mark', 'sponsorblock_mark') + report_conflict('--sponskrub', 'sponskrub', '--sponsorblock-remove', 'sponsorblock_remove') + report_conflict('--sponskrub-cut', 'sponskrub_cut', '--split-chapter', 'split_chapters', val1=opts.sponskrub and opts.sponskrub_cut) + + # Conflicts with --allow-unplayable-formats + report_conflict('--add-metadata', 'addmetadata') + report_conflict('--embed-chapters', 'addchapters') + report_conflict('--embed-info-json', 'embed_infojson') + report_conflict('--embed-subs', 'embedsubtitles') + report_conflict('--embed-thumbnail', 'embedthumbnail') + report_conflict('--extract-audio', 'extractaudio') + report_conflict('--fixup', 'fixup', val1=(opts.fixup or '').lower() in ('', 'never', 'ignore'), default='never') + report_conflict('--recode-video', 'recodevideo') + report_conflict('--remove-chapters', 'remove_chapters', default=[]) + report_conflict('--remux-video', 'remuxvideo') + report_conflict('--sponskrub', 'sponskrub') + report_conflict('--sponsorblock-remove', 'sponsorblock_remove', default=set()) + report_conflict('--xattrs', 'xattrs') + + # Fully deprecated options + def report_deprecation(val, old, new=None): + if not val: + return + deprecation_warnings.append( + f'{old} is deprecated and may be removed in a future version. 
Use {new} instead' if new + else f'{old} is deprecated and may not work as expected') + + report_deprecation(opts.sponskrub, '--sponskrub', '--sponsorblock-mark or --sponsorblock-remove') + report_deprecation(not opts.prefer_ffmpeg, '--prefer-avconv', 'ffmpeg') + # report_deprecation(opts.include_ads, '--include-ads') # We may re-implement this in future + # report_deprecation(opts.call_home, '--call-home') # We may re-implement this in future + # report_deprecation(opts.writeannotations, '--write-annotations') # It's just that no website has it + + # Dependent options + opts.date = DateRange.day(opts.date) if opts.date else DateRange(opts.dateafter, opts.datebefore) + + if opts.exec_before_dl_cmd: + opts.exec_cmd['before_dl'] = opts.exec_before_dl_cmd + + if opts.useid: # --id is not deprecated in youtube-dl + opts.outtmpl['default'] = '%(id)s.%(ext)s' + + if opts.overwrites: # --force-overwrites implies --no-continue + opts.continue_dl = False if (opts.addmetadata or opts.sponsorblock_mark) and opts.addchapters is None: + # Add chapters when adding metadata or marking sponsors opts.addchapters = True - opts.remove_chapters = opts.remove_chapters or [] - - def report_conflict(arg1, arg2): - warnings.append('%s is ignored since %s was given' % (arg2, arg1)) - - if (opts.remove_chapters or sponsorblock_query) and opts.sponskrub is not False: - if opts.sponskrub: - if opts.remove_chapters: - report_conflict('--remove-chapters', '--sponskrub') - if opts.sponsorblock_mark: - report_conflict('--sponsorblock-mark', '--sponskrub') - if opts.sponsorblock_remove: - report_conflict('--sponsorblock-remove', '--sponskrub') - opts.sponskrub = False - if opts.sponskrub_cut and opts.split_chapters and opts.sponskrub is not False: - report_conflict('--split-chapter', '--sponskrub-cut') - opts.sponskrub_cut = False - - if opts.remuxvideo and opts.recodevideo: - report_conflict('--recode-video', '--remux-video') - opts.remuxvideo = False - - if opts.allow_unplayable_formats: - if opts.extractaudio: - report_conflict('--allow-unplayable-formats', '--extract-audio') - opts.extractaudio = False - if opts.remuxvideo: - report_conflict('--allow-unplayable-formats', '--remux-video') - opts.remuxvideo = False - if opts.recodevideo: - report_conflict('--allow-unplayable-formats', '--recode-video') - opts.recodevideo = False - if opts.addmetadata: - report_conflict('--allow-unplayable-formats', '--add-metadata') - opts.addmetadata = False - if opts.embedsubtitles: - report_conflict('--allow-unplayable-formats', '--embed-subs') - opts.embedsubtitles = False - if opts.embedthumbnail: - report_conflict('--allow-unplayable-formats', '--embed-thumbnail') - opts.embedthumbnail = False - if opts.xattrs: - report_conflict('--allow-unplayable-formats', '--xattrs') - opts.xattrs = False - if opts.fixup and opts.fixup.lower() not in ('never', 'ignore'): - report_conflict('--allow-unplayable-formats', '--fixup') - opts.fixup = 'never' - if opts.remove_chapters: - report_conflict('--allow-unplayable-formats', '--remove-chapters') - opts.remove_chapters = [] - if opts.sponsorblock_remove: - report_conflict('--allow-unplayable-formats', '--sponsorblock-remove') - opts.sponsorblock_remove = set() - if opts.sponskrub: - report_conflict('--allow-unplayable-formats', '--sponskrub') - opts.sponskrub = False - - # PostProcessors - postprocessors = list(opts.add_postprocessors) - if sponsorblock_query: - postprocessors.append({ - 'key': 'SponsorBlock', - 'categories': sponsorblock_query, - 'api': opts.sponsorblock_api, - # Run this 
immediately after extraction is complete - 'when': 'pre_process' }) + + if opts.extractaudio and not opts.keepvideo and opts.format is None: + # Do not unnecessarily download audio + opts.format = 'bestaudio/best' + + if opts.getcomments and opts.writeinfojson is None: + # If JSON is not printed anywhere, but comments are requested, save it to file + if not (opts.dumpjson or opts.print_json or opts.dump_single_json): + opts.writeinfojson = True + + if opts.allsubtitles and not (opts.embedsubtitles or opts.writeautomaticsub): + # --all-sub automatically sets --write-sub if --write-auto-sub is not given + opts.writesubtitles = True + + if opts.addmetadata and opts.embed_infojson is None: + # If embedding metadata and infojson is present, embed it + opts.embed_infojson = 'if_exists' + + # Ask for passwords + if opts.username is not None and opts.password is None: + opts.password = compat_getpass('Type account password and press [Return]: ') + if opts.ap_username is not None and opts.ap_password is None: + opts.ap_password = compat_getpass('Type TV provider account password and press [Return]: ') + + return warnings, deprecation_warnings + + +def get_postprocessors(opts): + yield from opts.add_postprocessors + if opts.parse_metadata: - postprocessors.append({ + yield { 'key': 'MetadataParser', 'actions': opts.parse_metadata, - # Run this immediately after extraction is complete 'when': 'pre_process' - }) + } + sponsorblock_query = opts.sponsorblock_mark | opts.sponsorblock_remove + if sponsorblock_query: + yield { + 'key': 'SponsorBlock', + 'categories': sponsorblock_query, + 'api': opts.sponsorblock_api, + 'when': 'after_filter' + } if opts.convertsubtitles: - postprocessors.append({ + yield { 'key': 'FFmpegSubtitlesConvertor', 'format': opts.convertsubtitles, - # Run this before the actual video download 'when': 'before_dl' - }) + } if opts.convertthumbnails: - postprocessors.append({ + yield { 'key': 'FFmpegThumbnailsConvertor', 'format': opts.convertthumbnails, - # Run this before the actual video download 'when': 'before_dl' - }) + } - # Must be after all other before_dl - if opts.exec_before_dl_cmd: - postprocessors.append({ - 'key': 'Exec', - 'exec_cmd': opts.exec_before_dl_cmd, - 'when': 'before_dl' - }) if opts.extractaudio: - postprocessors.append({ + yield { 'key': 'FFmpegExtractAudio', 'preferredcodec': opts.audioformat, 'preferredquality': opts.audioquality, 'nopostoverwrites': opts.nopostoverwrites, - }) + } if opts.remuxvideo: - postprocessors.append({ + yield { 'key': 'FFmpegVideoRemuxer', 'preferedformat': opts.remuxvideo, - }) + } if opts.recodevideo: - postprocessors.append({ + yield { 'key': 'FFmpegVideoConvertor', 'preferedformat': opts.recodevideo, - }) + } # If ModifyChapters is going to remove chapters, subtitles must already be in the container. if opts.embedsubtitles: - already_have_subtitle = opts.writesubtitles and 'no-keep-subs' not in compat_opts - postprocessors.append({ + keep_subs = 'no-keep-subs' not in opts.compat_opts + yield { 'key': 'FFmpegEmbedSubtitle', # already_have_subtitle = True prevents the file from being deleted after embedding - 'already_have_subtitle': already_have_subtitle - }) - if not opts.writeautomaticsub and 'no-keep-subs' not in compat_opts: + 'already_have_subtitle': opts.writesubtitles and keep_subs + } + if not opts.writeautomaticsub and keep_subs: opts.writesubtitles = True - # --all-sub automatically sets --write-sub if --write-auto-sub is not given - # this was the old behaviour if only --all-sub was given. 
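The get_postprocessors refactor above replaces the repeated postprocessors.append({...}) calls with a generator: each yielded dict names the postprocessor class under 'key', carries its constructor arguments, and may set 'when' to pick the pipeline stage ('pre_process', 'after_filter', 'before_dl', 'playlist', 'after_move'; when omitted, the ordinary post-processing stage is used). A minimal sketch of that dict protocol, using a made-up toy generator and toy options rather than the real parsed opts:

    from types import SimpleNamespace

    def toy_postprocessors(opts):
        # Same dict shape as get_postprocessors yields above
        if opts.extractaudio:
            yield {
                'key': 'FFmpegExtractAudio',         # postprocessor class to instantiate
                'preferredcodec': opts.audioformat,  # constructor argument
            }
        if opts.xattrs:
            yield {'key': 'XAttrMetadata'}  # no 'when': runs at the default stage

    opts = SimpleNamespace(extractaudio=True, audioformat='mp3', xattrs=False)
    assert list(toy_postprocessors(opts)) == [
        {'key': 'FFmpegExtractAudio', 'preferredcodec': 'mp3'}]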
- if opts.allsubtitles and not opts.writeautomaticsub: - opts.writesubtitles = True + # ModifyChapters must run before FFmpegMetadataPP - remove_chapters_patterns, remove_ranges = [], [] - for regex in opts.remove_chapters: - if regex.startswith('*'): - dur = list(map(parse_duration, regex[1:].split('-'))) - if len(dur) == 2 and all(t is not None for t in dur): - remove_ranges.append(tuple(dur)) - continue - parser.error(f'invalid --remove-chapters time range {regex!r}. Must be of the form ?start-end') - try: - remove_chapters_patterns.append(re.compile(regex)) - except re.error as err: - parser.error(f'invalid --remove-chapters regex {regex!r} - {err}') if opts.remove_chapters or sponsorblock_query: - postprocessors.append({ + yield { 'key': 'ModifyChapters', - 'remove_chapters_patterns': remove_chapters_patterns, + 'remove_chapters_patterns': opts.remove_chapters, 'remove_sponsor_segments': opts.sponsorblock_remove, - 'remove_ranges': remove_ranges, + 'remove_ranges': opts.remove_ranges, 'sponsorblock_chapter_title': opts.sponsorblock_chapter_title, 'force_keyframes': opts.force_keyframes_at_cuts - }) + } # FFmpegMetadataPP should be run after FFmpegVideoConvertorPP and # FFmpegExtractAudioPP as containers before conversion may not support # metadata (3gp, webm, etc.) # By default ffmpeg preserves metadata applicable for both # source and target containers. From this point the container won't change, # so metadata can be added here. - if opts.addmetadata or opts.addchapters: - postprocessors.append({ + if opts.addmetadata or opts.addchapters or opts.embed_infojson: + yield { 'key': 'FFmpegMetadata', 'add_chapters': opts.addchapters, 'add_metadata': opts.addmetadata, - }) - # Note: Deprecated + 'add_infojson': opts.embed_infojson, + } + # Deprecated # This should be above EmbedThumbnail since sponskrub removes the thumbnail attachment # but must be below EmbedSubtitle and FFmpegMetadata # See https://github.com/hypervideo/hypervideo/issues/204 , https://github.com/faissaloo/SponSkrub/issues/29 # If opts.sponskrub is None, sponskrub is used, but it silently fails if the executable can't be found if opts.sponskrub is not False: - postprocessors.append({ + yield { 'key': 'SponSkrub', 'path': opts.sponskrub_path, 'args': opts.sponskrub_args, 'cut': opts.sponskrub_cut, 'force': opts.sponskrub_force, 'ignoreerror': opts.sponskrub is None, - }) + '_from_cli': True, + } if opts.embedthumbnail: - already_have_thumbnail = opts.writethumbnail or opts.write_all_thumbnails - postprocessors.append({ + yield { 'key': 'EmbedThumbnail', # already_have_thumbnail = True prevents the file from being deleted after embedding - 'already_have_thumbnail': already_have_thumbnail - }) - if not already_have_thumbnail: + 'already_have_thumbnail': opts.writethumbnail + } + if not opts.writethumbnail: opts.writethumbnail = True opts.outtmpl['pl_thumbnail'] = '' if opts.split_chapters: - postprocessors.append({ + yield { 'key': 'FFmpegSplitChapters', 'force_keyframes': opts.force_keyframes_at_cuts, - }) + } # XAttrMetadataPP should be run after post-processors that may change file contents if opts.xattrs: - postprocessors.append({'key': 'XAttrMetadata'}) - # Exec must be the last PP - if opts.exec_cmd: - postprocessors.append({ + yield {'key': 'XAttrMetadata'} + if opts.concat_playlist != 'never': + yield { + 'key': 'FFmpegConcat', + 'only_multi_video': opts.concat_playlist != 'always', + 'when': 'playlist', + } + # Exec must be the last PP of each category + for when, exec_cmd in opts.exec_cmd.items(): + yield { 'key': 
'Exec', - 'exec_cmd': opts.exec_cmd, - # Run this only after the files have been moved to their final locations - 'when': 'after_move' - }) + 'exec_cmd': exec_cmd, + 'when': when, + } - def report_args_compat(arg, name): - warnings.append('%s given without specifying name. The arguments will be given to all %s' % (arg, name)) - if 'default' in opts.external_downloader_args: - report_args_compat('--downloader-args', 'external downloaders') +def parse_options(argv=None): + """ @returns (parser, opts, urls, ydl_opts) """ + parser, opts, urls = parseOpts(argv) + urls = get_urls(urls, opts.batchfile, opts.verbose) - if 'default-compat' in opts.postprocessor_args and 'default' not in opts.postprocessor_args: - report_args_compat('--post-processor-args', 'post-processors') - opts.postprocessor_args.setdefault('sponskrub', []) - opts.postprocessor_args['default'] = opts.postprocessor_args['default-compat'] + set_compat_opts(opts) + try: + warnings, deprecation_warnings = validate_options(opts) + except ValueError as err: + parser.error(f'{err}\n') + + postprocessors = list(get_postprocessors(opts)) + + any_getting = (any(opts.forceprint.values()) or opts.dumpjson or opts.dump_single_json + or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail + or opts.getdescription or opts.getfilename or opts.getformat or opts.getduration) + + any_printing = opts.print_json final_ext = ( opts.recodevideo if opts.recodevideo in FFmpegVideoConvertorPP.SUPPORTED_EXTS @@ -579,11 +637,7 @@ def _real_main(argv=None): else opts.audioformat if (opts.extractaudio and opts.audioformat != 'best') else None) - match_filter = ( - None if opts.match_filter is None - else match_filter_func(opts.match_filter)) - - ydl_opts = { + return parser, opts, urls, { 'usenetrc': opts.usenetrc, 'netrc_location': opts.netrc_location, 'username': opts.username, @@ -604,6 +658,7 @@ def _real_main(argv=None): 'forcefilename': opts.getfilename, 'forceformat': opts.getformat, 'forceprint': opts.forceprint, + 'print_to_file': opts.print_to_file, 'forcejson': opts.dumpjson or opts.print_json, 'dump_single_json': opts.dump_single_json, 'force_write_download_archive': opts.force_write_download_archive, @@ -632,6 +687,7 @@ def _real_main(argv=None): 'throttledratelimit': opts.throttledratelimit, 'overwrites': opts.overwrites, 'retries': opts.retries, + 'file_access_retries': opts.file_access_retries, 'fragment_retries': opts.fragment_retries, 'extractor_retries': opts.extractor_retries, 'skip_unavailable_fragments': opts.skip_unavailable_fragments, @@ -649,7 +705,7 @@ def _real_main(argv=None): 'playlistreverse': opts.playlist_reverse, 'playlistrandom': opts.playlist_random, 'noplaylist': opts.noplaylist, - 'logtostderr': outtmpl_default == '-', + 'logtostderr': opts.outtmpl.get('default') == '-', 'consoletitle': opts.consoletitle, 'nopart': opts.nopart, 'updatetime': opts.updatetime, @@ -659,8 +715,8 @@ def _real_main(argv=None): 'allow_playlist_files': opts.allow_playlist_files, 'clean_infojson': opts.clean_infojson, 'getcomments': opts.getcomments, - 'writethumbnail': opts.writethumbnail, - 'write_all_thumbnails': opts.write_all_thumbnails, + 'writethumbnail': opts.writethumbnail is True, + 'write_all_thumbnails': opts.writethumbnail == 'all', 'writelink': opts.writelink, 'writeurllink': opts.writeurllink, 'writewebloclink': opts.writewebloclink, @@ -685,18 +741,21 @@ def _real_main(argv=None): 'max_filesize': opts.max_filesize, 'min_views': opts.min_views, 'max_views': opts.max_views, - 'daterange': date, + 'daterange': opts.date, 
'cachedir': opts.cachedir, 'youtube_print_sig_code': opts.youtube_print_sig_code, 'age_limit': opts.age_limit, - 'download_archive': download_archive_fn, + 'download_archive': opts.download_archive, 'break_on_existing': opts.break_on_existing, 'break_on_reject': opts.break_on_reject, + 'break_per_url': opts.break_per_url, 'skip_playlist_after_errors': opts.skip_playlist_after_errors, 'cookiefile': opts.cookiefile, 'cookiesfrombrowser': opts.cookiesfrombrowser, + 'legacyserverconnect': opts.legacy_server_connect, 'nocheckcertificate': opts.no_check_certificate, 'prefer_insecure': opts.prefer_insecure, + 'http_headers': opts.headers, 'proxy': opts.proxy, 'socket_timeout': opts.socket_timeout, 'bidi_workaround': opts.bidi_workaround, @@ -710,6 +769,8 @@ def _real_main(argv=None): 'youtube_include_hls_manifest': opts.youtube_include_hls_manifest, 'encoding': opts.encoding, 'extract_flat': opts.extract_flat, + 'live_from_start': opts.live_from_start, + 'wait_for_video': opts.wait_for_video, 'mark_watched': opts.mark_watched, 'merge_output_format': opts.merge_output_format, 'final_ext': final_ext, @@ -725,7 +786,7 @@ def _real_main(argv=None): 'list_thumbnails': opts.list_thumbnails, 'playlist_items': opts.playlist_items, 'xattr_set_filesize': opts.xattr_set_filesize, - 'match_filter': match_filter, + 'match_filter': opts.match_filter, 'no_color': opts.no_color, 'ffmpeg_location': opts.ffmpeg_location, 'hls_prefer_native': opts.hls_prefer_native, @@ -738,12 +799,35 @@ def _real_main(argv=None): 'geo_bypass': opts.geo_bypass, 'geo_bypass_country': opts.geo_bypass_country, 'geo_bypass_ip_block': opts.geo_bypass_ip_block, - 'warnings': warnings, - 'compat_opts': compat_opts, + '_warnings': warnings, + '_deprecation_warnings': deprecation_warnings, + 'compat_opts': opts.compat_opts, } + +def _real_main(argv=None): + # Compatibility fixes for Windows + if sys.platform == 'win32': + # https://github.com/ytdl-org/youtube-dl/issues/820 + codecs.register(lambda name: codecs.lookup('utf-8') if name == 'cp65001' else None) + + workaround_optparse_bug9161() + + setproctitle('hypervideo') + + parser, opts, all_urls, ydl_opts = parse_options(argv) + + # Dump user agent + if opts.dump_user_agent: + ua = traverse_obj(opts.headers, 'User-Agent', casesense=False, default=std_headers['User-Agent']) + write_string(f'{ua}\n', out=sys.stdout) + sys.exit(0) + + if print_extractor_information(opts, all_urls): + sys.exit(0) + with YoutubeDL(ydl_opts) as ydl: - actual_use = len(all_urls) or opts.load_info_filename + actual_use = all_urls or opts.load_info_filename # Remove cache dir if opts.rm_cachedir: @@ -761,7 +845,7 @@ def _real_main(argv=None): retcode = ydl.download_with_info_file(expand_path(opts.load_info_filename)) else: retcode = ydl.download(all_urls) - except (MaxDownloadsReached, ExistingVideoReached, RejectedVideoReached): + except DownloadCancelled: ydl.to_screen('Aborting remaining downloads') retcode = 101 @@ -773,15 +857,21 @@ def main(argv=None): _real_main(argv) except DownloadError: sys.exit(1) - except SameFileError: - sys.exit('ERROR: fixed output name but more than one file to download') + except SameFileError as e: + sys.exit(f'ERROR: {e}') except KeyboardInterrupt: sys.exit('\nERROR: Interrupted by user') - except BrokenPipeError: + except BrokenPipeError as e: # https://docs.python.org/3/library/signal.html#note-on-sigpipe devnull = os.open(os.devnull, os.O_WRONLY) os.dup2(devnull, sys.stdout.fileno()) - sys.exit(r'\nERROR: {err}') + sys.exit(f'\nERROR: {e}') -__all__ = ['main', 'YoutubeDL', 
'gen_extractors', 'list_extractors'] +__all__ = [ + 'main', + 'YoutubeDL', + 'parse_options', + 'gen_extractors', + 'list_extractors', +] diff --git a/hypervideo_dl/__main__.py b/hypervideo_dl/__main__.py old mode 100755 new mode 100644 diff --git a/hypervideo_dl/aes.py b/hypervideo_dl/aes.py index 60cdeb7..b37f0dd 100644 --- a/hypervideo_dl/aes.py +++ b/hypervideo_dl/aes.py @@ -2,8 +2,15 @@ from __future__ import unicode_literals from math import ceil -from .compat import compat_b64decode, compat_pycrypto_AES -from .utils import bytes_to_intlist, intlist_to_bytes +from .compat import ( + compat_b64decode, + compat_ord, + compat_pycrypto_AES, +) +from .utils import ( + bytes_to_intlist, + intlist_to_bytes, +) if compat_pycrypto_AES: @@ -25,9 +32,55 @@ else: return intlist_to_bytes(aes_gcm_decrypt_and_verify(*map(bytes_to_intlist, (data, key, tag, nonce)))) +def unpad_pkcs7(data): + return data[:-compat_ord(data[-1])] + + BLOCK_SIZE_BYTES = 16 +def aes_ecb_encrypt(data, key, iv=None): + """ + Encrypt with aes in ECB mode + + @param {int[]} data cleartext + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv Unused for this mode + @returns {int[]} encrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + encrypted_data = [] + for i in range(block_count): + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + encrypted_data += aes_encrypt(block, expanded_key) + encrypted_data = encrypted_data[:len(data)] + + return encrypted_data + + +def aes_ecb_decrypt(data, key, iv=None): + """ + Decrypt with aes in ECB mode + + @param {int[]} data ciphertext + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv Unused for this mode + @returns {int[]} decrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + decrypted_data = [] + for i in range(block_count): + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + decrypted_data += aes_decrypt(block, expanded_key) + decrypted_data = decrypted_data[:len(data)] + + return decrypted_data + + def aes_ctr_decrypt(data, key, iv): """ Decrypt with aes in counter mode @@ -464,5 +517,6 @@ __all__ = [ 'aes_encrypt', 'aes_gcm_decrypt_and_verify', 'aes_gcm_decrypt_and_verify_bytes', - 'key_expansion' + 'key_expansion', + 'unpad_pkcs7', ] diff --git a/hypervideo_dl/compat.py b/hypervideo_dl/compat.py index 5e0e5d8..bdea14c 100644 --- a/hypervideo_dl/compat.py +++ b/hypervideo_dl/compat.py @@ -2,6 +2,7 @@ import asyncio import base64 +import collections import ctypes import getpass import html @@ -19,6 +20,7 @@ import shlex import shutil import socket import struct +import subprocess import sys import tokenize import urllib @@ -132,6 +134,16 @@ except AttributeError: asyncio.run = compat_asyncio_run +try: # >= 3.7 + asyncio.tasks.all_tasks +except AttributeError: + asyncio.tasks.all_tasks = asyncio.tasks.Task.all_tasks + +try: + import websockets as compat_websockets +except ImportError: + compat_websockets = None + # Python 3.8+ does not honor %HOME% on windows, but this breaks compatibility with youtube-dl # See https://github.com/hypervideo/hypervideo/issues/792 # https://docs.python.org/3/library/os.path.html#os.path.expanduser @@ -158,25 +170,45 @@ except ImportError: except ImportError: compat_pycrypto_AES = None +try: + import brotlicffi as compat_brotli +except ImportError: + try: + import brotli as compat_brotli + except ImportError: + compat_brotli = None + +WINDOWS_VT_MODE = False if 
compat_os_name == 'nt' else None + def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075 if compat_os_name != 'nt': return - os.system('') + global WINDOWS_VT_MODE + startupinfo = subprocess.STARTUPINFO() + startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW + try: + subprocess.Popen('', shell=True, startupinfo=startupinfo) + WINDOWS_VT_MODE = True + except Exception: + pass # Deprecated compat_basestring = str compat_chr = chr +compat_filter = filter compat_input = input compat_integer_types = (int, ) compat_kwargs = lambda kwargs: kwargs +compat_map = map compat_numeric_types = (int, float, complex) compat_str = str compat_xpath = lambda xpath: xpath compat_zip = zip +compat_collections_abc = collections.abc compat_HTMLParser = html.parser.HTMLParser compat_HTTPError = urllib.error.HTTPError compat_Struct = struct.Struct @@ -223,6 +255,7 @@ compat_xml_parse_error = etree.ParseError # Set public objects __all__ = [ + 'WINDOWS_VT_MODE', 'compat_HTMLParseError', 'compat_HTMLParser', 'compat_HTTPError', @@ -232,7 +265,9 @@ __all__ = [ 'compat_asyncio_run', 'compat_b64decode', 'compat_basestring', + 'compat_brotli', 'compat_chr', + 'compat_collections_abc', 'compat_cookiejar', 'compat_cookiejar_Cookie', 'compat_cookies', @@ -242,6 +277,7 @@ __all__ = [ 'compat_etree_fromstring', 'compat_etree_register_namespace', 'compat_expanduser', + 'compat_filter', 'compat_get_terminal_size', 'compat_getenv', 'compat_getpass', @@ -253,6 +289,7 @@ __all__ = [ 'compat_integer_types', 'compat_itertools_count', 'compat_kwargs', + 'compat_map', 'compat_numeric_types', 'compat_ord', 'compat_os_name', @@ -284,6 +321,7 @@ __all__ = [ 'compat_urllib_response', 'compat_urlparse', 'compat_urlretrieve', + 'compat_websockets', 'compat_xml_parse_error', 'compat_xpath', 'compat_zip', diff --git a/hypervideo_dl/cookies.py b/hypervideo_dl/cookies.py index 38fbdfa..f963729 100644 --- a/hypervideo_dl/cookies.py +++ b/hypervideo_dl/cookies.py @@ -1,3 +1,4 @@ +import contextlib import ctypes import json import os @@ -7,17 +8,22 @@ import subprocess import sys import tempfile from datetime import datetime, timedelta, timezone +from enum import Enum, auto from hashlib import pbkdf2_hmac -from .aes import aes_cbc_decrypt_bytes, aes_gcm_decrypt_and_verify_bytes +from .aes import ( + aes_cbc_decrypt_bytes, + aes_gcm_decrypt_and_verify_bytes, + unpad_pkcs7, +) from .compat import ( compat_b64decode, compat_cookiejar_Cookie, ) from .utils import ( - bug_reports_message, + error_to_str, expand_path, - process_communicate_or_kill, + Popen, YoutubeDLCookieJar, ) @@ -31,19 +37,16 @@ except ImportError: try: - import keyring - KEYRING_AVAILABLE = True - KEYRING_UNAVAILABLE_REASON = f'due to unknown reasons{bug_reports_message()}' + import secretstorage + SECRETSTORAGE_AVAILABLE = True except ImportError: - KEYRING_AVAILABLE = False - KEYRING_UNAVAILABLE_REASON = ( - 'as the `keyring` module is not installed. ' - 'Please install by running `python3 -m pip install keyring`. ' - 'Depending on your platform, additional packages may be required ' - 'to access the keyring; see https://pypi.org/project/keyring') + SECRETSTORAGE_AVAILABLE = False + SECRETSTORAGE_UNAVAILABLE_REASON = ( + 'as the `secretstorage` module is not installed. 
' + 'Please install by running `python3 -m pip install secretstorage`.') except Exception as _err: - KEYRING_AVAILABLE = False - KEYRING_UNAVAILABLE_REASON = 'as the `keyring` module could not be initialized: %s' % _err + SECRETSTORAGE_AVAILABLE = False + SECRETSTORAGE_UNAVAILABLE_REASON = f'as the `secretstorage` module could not be initialized. {_err}' CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'} @@ -74,8 +77,8 @@ class YDLLogger: def load_cookies(cookie_file, browser_specification, ydl): cookie_jars = [] if browser_specification is not None: - browser_name, profile = _parse_browser_specification(*browser_specification) - cookie_jars.append(extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl))) + browser_name, profile, keyring = _parse_browser_specification(*browser_specification) + cookie_jars.append(extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl), keyring=keyring)) if cookie_file is not None: cookie_file = expand_path(cookie_file) @@ -87,13 +90,13 @@ def load_cookies(cookie_file, browser_specification, ydl): return _merge_cookie_jars(cookie_jars) -def extract_cookies_from_browser(browser_name, profile=None, logger=YDLLogger()): +def extract_cookies_from_browser(browser_name, profile=None, logger=YDLLogger(), *, keyring=None): if browser_name == 'firefox': return _extract_firefox_cookies(profile, logger) elif browser_name == 'safari': return _extract_safari_cookies(profile, logger) elif browser_name in CHROMIUM_BASED_BROWSERS: - return _extract_chrome_cookies(browser_name, profile, logger) + return _extract_chrome_cookies(browser_name, profile, keyring, logger) else: raise ValueError('unknown browser: {}'.format(browser_name)) @@ -117,7 +120,7 @@ def _extract_firefox_cookies(profile, logger): raise FileNotFoundError('could not find firefox cookies database in {}'.format(search_root)) logger.debug('Extracting cookies from: "{}"'.format(cookie_database_path)) - with tempfile.TemporaryDirectory(prefix='youtube_dl') as tmpdir: + with tempfile.TemporaryDirectory(prefix='hypervideo_dl') as tmpdir: cursor = None try: cursor = _open_database_copy(cookie_database_path, tmpdir) @@ -207,7 +210,7 @@ def _get_chromium_based_browser_settings(browser_name): } -def _extract_chrome_cookies(browser_name, profile, logger): +def _extract_chrome_cookies(browser_name, profile, keyring, logger): logger.info('Extracting cookies from {}'.format(browser_name)) if not SQLITE_AVAILABLE: @@ -234,9 +237,9 @@ def _extract_chrome_cookies(browser_name, profile, logger): raise FileNotFoundError('could not find {} cookies database in "{}"'.format(browser_name, search_root)) logger.debug('Extracting cookies from: "{}"'.format(cookie_database_path)) - decryptor = get_cookie_decryptor(config['browser_dir'], config['keyring_name'], logger) + decryptor = get_cookie_decryptor(config['browser_dir'], config['keyring_name'], logger, keyring=keyring) - with tempfile.TemporaryDirectory(prefix='youtube_dl') as tmpdir: + with tempfile.TemporaryDirectory(prefix='hypervideo_dl') as tmpdir: cursor = None try: cursor = _open_database_copy(cookie_database_path, tmpdir) @@ -247,6 +250,7 @@ def _extract_chrome_cookies(browser_name, profile, logger): 'expires_utc, {} FROM cookies'.format(secure_column)) jar = YoutubeDLCookieJar() failed_cookies = 0 + unencrypted_cookies = 0 for host_key, name, value, encrypted_value, path, expires_utc, is_secure in cursor.fetchall(): host_key = host_key.decode('utf-8') name = name.decode('utf-8') @@ -258,6 +262,8 @@ def 
_extract_chrome_cookies(browser_name, profile, logger): if value is None: failed_cookies += 1 continue + else: + unencrypted_cookies += 1 cookie = compat_cookiejar_Cookie( version=0, name=name, value=value, port=None, port_specified=False, @@ -270,6 +276,9 @@ def _extract_chrome_cookies(browser_name, profile, logger): else: failed_message = '' logger.info('Extracted {} cookies from {}{}'.format(len(jar), browser_name, failed_message)) + counts = decryptor.cookie_counts.copy() + counts['unencrypted'] = unencrypted_cookies + logger.debug('cookie version breakdown: {}'.format(counts)) return jar finally: if cursor is not None: @@ -305,10 +314,14 @@ class ChromeCookieDecryptor: def decrypt(self, encrypted_value): raise NotImplementedError + @property + def cookie_counts(self): + raise NotImplementedError + -def get_cookie_decryptor(browser_root, browser_keyring_name, logger): +def get_cookie_decryptor(browser_root, browser_keyring_name, logger, *, keyring=None): if sys.platform in ('linux', 'linux2'): - return LinuxChromeCookieDecryptor(browser_keyring_name, logger) + return LinuxChromeCookieDecryptor(browser_keyring_name, logger, keyring=keyring) elif sys.platform == 'darwin': return MacChromeCookieDecryptor(browser_keyring_name, logger) elif sys.platform == 'win32': @@ -319,13 +332,12 @@ def get_cookie_decryptor(browser_root, browser_keyring_name, logger): class LinuxChromeCookieDecryptor(ChromeCookieDecryptor): - def __init__(self, browser_keyring_name, logger): + def __init__(self, browser_keyring_name, logger, *, keyring=None): self._logger = logger self._v10_key = self.derive_key(b'peanuts') - if KEYRING_AVAILABLE: - self._v11_key = self.derive_key(_get_linux_keyring_password(browser_keyring_name)) - else: - self._v11_key = None + password = _get_linux_keyring_password(browser_keyring_name, keyring, logger) + self._v11_key = None if password is None else self.derive_key(password) + self._cookie_counts = {'v10': 0, 'v11': 0, 'other': 0} @staticmethod def derive_key(password): @@ -333,20 +345,27 @@ class LinuxChromeCookieDecryptor(ChromeCookieDecryptor): # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_linux.cc return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1, key_length=16) + @property + def cookie_counts(self): + return self._cookie_counts + def decrypt(self, encrypted_value): version = encrypted_value[:3] ciphertext = encrypted_value[3:] if version == b'v10': + self._cookie_counts['v10'] += 1 return _decrypt_aes_cbc(ciphertext, self._v10_key, self._logger) elif version == b'v11': + self._cookie_counts['v11'] += 1 if self._v11_key is None: - self._logger.warning(f'cannot decrypt cookie {KEYRING_UNAVAILABLE_REASON}', only_once=True) + self._logger.warning('cannot decrypt v11 cookies: no key found', only_once=True) return None return _decrypt_aes_cbc(ciphertext, self._v11_key, self._logger) else: + self._cookie_counts['other'] += 1 return None @@ -355,6 +374,7 @@ class MacChromeCookieDecryptor(ChromeCookieDecryptor): self._logger = logger password = _get_mac_keyring_password(browser_keyring_name, logger) self._v10_key = None if password is None else self.derive_key(password) + self._cookie_counts = {'v10': 0, 'other': 0} @staticmethod def derive_key(password): @@ -362,11 +382,16 @@ class MacChromeCookieDecryptor(ChromeCookieDecryptor): # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_mac.mm return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1003, key_length=16) + @property + 
def cookie_counts(self): + return self._cookie_counts + def decrypt(self, encrypted_value): version = encrypted_value[:3] ciphertext = encrypted_value[3:] if version == b'v10': + self._cookie_counts['v10'] += 1 if self._v10_key is None: self._logger.warning('cannot decrypt v10 cookies: no key found', only_once=True) return None @@ -374,6 +399,7 @@ class MacChromeCookieDecryptor(ChromeCookieDecryptor): return _decrypt_aes_cbc(ciphertext, self._v10_key, self._logger) else: + self._cookie_counts['other'] += 1 # other prefixes are considered 'old data' which were stored as plaintext # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_mac.mm return encrypted_value @@ -383,12 +409,18 @@ class WindowsChromeCookieDecryptor(ChromeCookieDecryptor): def __init__(self, browser_root, logger): self._logger = logger self._v10_key = _get_windows_v10_key(browser_root, logger) + self._cookie_counts = {'v10': 0, 'other': 0} + + @property + def cookie_counts(self): + return self._cookie_counts def decrypt(self, encrypted_value): version = encrypted_value[:3] ciphertext = encrypted_value[3:] if version == b'v10': + self._cookie_counts['v10'] += 1 if self._v10_key is None: self._logger.warning('cannot decrypt v10 cookies: no key found', only_once=True) return None @@ -408,6 +440,7 @@ class WindowsChromeCookieDecryptor(ChromeCookieDecryptor): return _decrypt_aes_gcm(ciphertext, self._v10_key, nonce, authentication_tag, self._logger) else: + self._cookie_counts['other'] += 1 # any other prefix means the data is DPAPI encrypted # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_win.cc return _decrypt_windows_dpapi(encrypted_value, self._logger).decode('utf-8') @@ -422,7 +455,10 @@ def _extract_safari_cookies(profile, logger): cookies_path = os.path.expanduser('~/Library/Cookies/Cookies.binarycookies') if not os.path.isfile(cookies_path): - raise FileNotFoundError('could not find safari cookies database') + logger.debug('Trying secondary cookie location') + cookies_path = os.path.expanduser('~/Library/Containers/com.apple.Safari/Data/Library/Cookies/Cookies.binarycookies') + if not os.path.isfile(cookies_path): + raise FileNotFoundError('could not find safari cookies database') with open(cookies_path, 'rb') as f: cookies_data = f.read() @@ -577,42 +613,220 @@ def parse_safari_cookies(data, jar=None, logger=YDLLogger()): return jar -def _get_linux_keyring_password(browser_keyring_name): - password = keyring.get_password('{} Keys'.format(browser_keyring_name), - '{} Safe Storage'.format(browser_keyring_name)) - if password is None: - # this sometimes occurs in KDE because chrome does not check hasEntry and instead - # just tries to read the value (which kwallet returns "") whereas keyring checks hasEntry - # to verify this: - # dbus-monitor "interface='org.kde.KWallet'" "type=method_return" - # while starting chrome. - # this may be a bug as the intended behaviour is to generate a random password and store - # it, but that doesn't matter here. 
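Taken together, the decryptor classes above reduce the common 'v10' case on Linux to PBKDF2 plus AES-CBC: the key is derived from the hard-coded password b'peanuts' with salt b'saltysalt' and a single iteration (see derive_key), the IV is sixteen spaces, and the PKCS#7 padding is stripped by the new unpad_pkcs7 helper. A minimal sketch of that path, assuming the package is importable as hypervideo_dl and given a hypothetical encrypted cookie value:

    from hashlib import pbkdf2_hmac

    from hypervideo_dl.aes import aes_cbc_decrypt_bytes, unpad_pkcs7

    def decrypt_linux_v10(encrypted_value):
        # 'v10' values are keyed from the hard-coded password, not the OS keyring
        assert encrypted_value[:3] == b'v10'
        key = pbkdf2_hmac('sha1', b'peanuts', b'saltysalt', 1, dklen=16)
        ciphertext = encrypted_value[3:]
        # Constant IV of 16 spaces, padding removed per PKCS#7
        plaintext = unpad_pkcs7(aes_cbc_decrypt_bytes(ciphertext, key, b' ' * 16))
        return plaintext.decode('utf-8')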
- password = '' - return password.encode('utf-8') +class _LinuxDesktopEnvironment(Enum): + """ + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/base/nix/xdg_util.h + DesktopEnvironment + """ + OTHER = auto() + CINNAMON = auto() + GNOME = auto() + KDE = auto() + PANTHEON = auto() + UNITY = auto() + XFCE = auto() -def _get_mac_keyring_password(browser_keyring_name, logger): - if KEYRING_AVAILABLE: - logger.debug('using keyring to obtain password') - password = keyring.get_password('{} Safe Storage'.format(browser_keyring_name), browser_keyring_name) - return password.encode('utf-8') +class _LinuxKeyring(Enum): + """ + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/key_storage_util_linux.h + SelectedLinuxBackend + """ + KWALLET = auto() + GNOMEKEYRING = auto() + BASICTEXT = auto() + + +SUPPORTED_KEYRINGS = _LinuxKeyring.__members__.keys() + + +def _get_linux_desktop_environment(env): + """ + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/base/nix/xdg_util.cc + GetDesktopEnvironment + """ + xdg_current_desktop = env.get('XDG_CURRENT_DESKTOP', None) + desktop_session = env.get('DESKTOP_SESSION', None) + if xdg_current_desktop is not None: + xdg_current_desktop = xdg_current_desktop.split(':')[0].strip() + + if xdg_current_desktop == 'Unity': + if desktop_session is not None and 'gnome-fallback' in desktop_session: + return _LinuxDesktopEnvironment.GNOME + else: + return _LinuxDesktopEnvironment.UNITY + elif xdg_current_desktop == 'GNOME': + return _LinuxDesktopEnvironment.GNOME + elif xdg_current_desktop == 'X-Cinnamon': + return _LinuxDesktopEnvironment.CINNAMON + elif xdg_current_desktop == 'KDE': + return _LinuxDesktopEnvironment.KDE + elif xdg_current_desktop == 'Pantheon': + return _LinuxDesktopEnvironment.PANTHEON + elif xdg_current_desktop == 'XFCE': + return _LinuxDesktopEnvironment.XFCE + elif desktop_session is not None: + if desktop_session in ('mate', 'gnome'): + return _LinuxDesktopEnvironment.GNOME + elif 'kde' in desktop_session: + return _LinuxDesktopEnvironment.KDE + elif 'xfce' in desktop_session: + return _LinuxDesktopEnvironment.XFCE else: - logger.debug('using find-generic-password to obtain password') - proc = subprocess.Popen(['security', 'find-generic-password', - '-w', # write password to stdout - '-a', browser_keyring_name, # match 'account' - '-s', '{} Safe Storage'.format(browser_keyring_name)], # match 'service' - stdout=subprocess.PIPE, - stderr=subprocess.DEVNULL) - try: - stdout, stderr = process_communicate_or_kill(proc) - if stdout[-1:] == b'\n': - stdout = stdout[:-1] - return stdout - except BaseException as e: - logger.warning(f'exception running find-generic-password: {type(e).__name__}({e})') - return None + if 'GNOME_DESKTOP_SESSION_ID' in env: + return _LinuxDesktopEnvironment.GNOME + elif 'KDE_FULL_SESSION' in env: + return _LinuxDesktopEnvironment.KDE + return _LinuxDesktopEnvironment.OTHER + + +def _choose_linux_keyring(logger): + """ + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/key_storage_util_linux.cc + SelectBackend + """ + desktop_environment = _get_linux_desktop_environment(os.environ) + logger.debug('detected desktop environment: {}'.format(desktop_environment.name)) + if desktop_environment == _LinuxDesktopEnvironment.KDE: + linux_keyring = _LinuxKeyring.KWALLET + elif desktop_environment == _LinuxDesktopEnvironment.OTHER: + linux_keyring = _LinuxKeyring.BASICTEXT + else: + linux_keyring = _LinuxKeyring.GNOMEKEYRING + return 
linux_keyring + + +def _get_kwallet_network_wallet(logger): + """ The name of the wallet used to store network passwords. + + https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/kwallet_dbus.cc + KWalletDBus::NetworkWallet + which does a dbus call to the following function: + https://api.kde.org/frameworks/kwallet/html/classKWallet_1_1Wallet.html + Wallet::NetworkWallet + """ + default_wallet = 'kdewallet' + try: + proc = Popen([ + 'dbus-send', '--session', '--print-reply=literal', + '--dest=org.kde.kwalletd5', + '/modules/kwalletd5', + 'org.kde.KWallet.networkWallet' + ], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) + + stdout, stderr = proc.communicate_or_kill() + if proc.returncode != 0: + logger.warning('failed to read NetworkWallet') + return default_wallet + else: + network_wallet = stdout.decode('utf-8').strip() + logger.debug('NetworkWallet = "{}"'.format(network_wallet)) + return network_wallet + except Exception as e: + logger.warning('exception while obtaining NetworkWallet: {}'.format(e)) + return default_wallet + + +def _get_kwallet_password(browser_keyring_name, logger): + logger.debug('using kwallet-query to obtain password from kwallet') + + if shutil.which('kwallet-query') is None: + logger.error('kwallet-query command not found. KWallet and kwallet-query ' + 'must be installed to read from KWallet. kwallet-query should be ' + 'included in the kwallet package for your distribution') + return b'' + + network_wallet = _get_kwallet_network_wallet(logger) + + try: + proc = Popen([ + 'kwallet-query', + '--read-password', '{} Safe Storage'.format(browser_keyring_name), + '--folder', '{} Keys'.format(browser_keyring_name), + network_wallet + ], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) + + stdout, stderr = proc.communicate_or_kill() + if proc.returncode != 0: + logger.error('kwallet-query failed with return code {}. Please consult ' + 'the kwallet-query man page for details'.format(proc.returncode)) + return b'' + else: + if stdout.lower().startswith(b'failed to read'): + logger.debug('failed to read password from kwallet. Using empty string instead') + # this sometimes occurs in KDE because chrome does not check hasEntry and instead + # just tries to read the value (which kwallet returns "") whereas kwallet-query + # checks hasEntry. To verify this: + # dbus-monitor "interface='org.kde.KWallet'" "type=method_return" + # while starting chrome. + # this may be a bug as the intended behaviour is to generate a random password and store + # it, but that doesn't matter here. + return b'' + else: + logger.debug('password found') + if stdout[-1:] == b'\n': + stdout = stdout[:-1] + return stdout + except Exception as e: + logger.warning(f'exception running kwallet-query: {error_to_str(e)}') + return b'' + + +def _get_gnome_keyring_password(browser_keyring_name, logger): + if not SECRETSTORAGE_AVAILABLE: + logger.error('secretstorage not available {}'.format(SECRETSTORAGE_UNAVAILABLE_REASON)) + return b'' + # the Gnome keyring does not seem to organise keys in the same way as KWallet, + # using `dbus-monitor` during startup, it can be observed that chromium lists all keys + # and presumably searches for its key in the list. It appears that we must do the same. 
+ # https://github.com/jaraco/keyring/issues/556 + with contextlib.closing(secretstorage.dbus_init()) as con: + col = secretstorage.get_default_collection(con) + for item in col.get_all_items(): + if item.get_label() == '{} Safe Storage'.format(browser_keyring_name): + return item.get_secret() + else: + logger.error('failed to read from keyring') + return b'' + + +def _get_linux_keyring_password(browser_keyring_name, keyring, logger): + # note: chrome/chromium can be run with the following flags to determine which keyring backend + # it has chosen to use + # chromium --enable-logging=stderr --v=1 2>&1 | grep key_storage_ + # Chromium supports a flag: --password-store= so the automatic detection + # will not be sufficient in all cases. + + keyring = _LinuxKeyring[keyring] if keyring else _choose_linux_keyring(logger) + logger.debug(f'Chosen keyring: {keyring.name}') + + if keyring == _LinuxKeyring.KWALLET: + return _get_kwallet_password(browser_keyring_name, logger) + elif keyring == _LinuxKeyring.GNOMEKEYRING: + return _get_gnome_keyring_password(browser_keyring_name, logger) + elif keyring == _LinuxKeyring.BASICTEXT: + # when basic text is chosen, all cookies are stored as v10 (so no keyring password is required) + return None + assert False, f'Unknown keyring {keyring}' + + +def _get_mac_keyring_password(browser_keyring_name, logger): + logger.debug('using find-generic-password to obtain password from OSX keychain') + try: + proc = Popen( + ['security', 'find-generic-password', + '-w', # write password to stdout + '-a', browser_keyring_name, # match 'account' + '-s', '{} Safe Storage'.format(browser_keyring_name)], # match 'service' + stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) + + stdout, stderr = proc.communicate_or_kill() + if stdout[-1:] == b'\n': + stdout = stdout[:-1] + return stdout + except Exception as e: + logger.warning(f'exception running find-generic-password: {error_to_str(e)}') + return None def _get_windows_v10_key(browser_root, logger): @@ -620,7 +834,7 @@ def _get_windows_v10_key(browser_root, logger): if path is None: logger.error('could not find local state file') return None - with open(path, 'r') as f: + with open(path, 'r', encoding='utf8') as f: data = json.load(f) try: base64_key = data['os_crypt']['encrypted_key'] @@ -640,10 +854,9 @@ def pbkdf2_sha1(password, salt, iterations, key_length): def _decrypt_aes_cbc(ciphertext, key, logger, initialization_vector=b' ' * 16): - plaintext = aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector) - padding_length = plaintext[-1] + plaintext = unpad_pkcs7(aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector)) try: - return plaintext[:-padding_length].decode('utf-8') + return plaintext.decode('utf-8') except UnicodeDecodeError: logger.warning('failed to decrypt cookie (AES-CBC) because UTF-8 decoding failed. 
Possibly the key is wrong?', only_once=True) return None @@ -736,10 +949,11 @@ def _is_path(value): return os.path.sep in value -def _parse_browser_specification(browser_name, profile=None): - browser_name = browser_name.lower() +def _parse_browser_specification(browser_name, profile=None, keyring=None): if browser_name not in SUPPORTED_BROWSERS: raise ValueError(f'unsupported browser: "{browser_name}"') + if keyring not in (None, *SUPPORTED_KEYRINGS): + raise ValueError(f'unsupported keyring: "{keyring}"') if profile is not None and _is_path(profile): profile = os.path.expanduser(profile) - return browser_name, profile + return browser_name, profile, keyring diff --git a/hypervideo_dl/downloader/__init__.py b/hypervideo_dl/downloader/__init__.py index 2449c74..96d484d 100644 --- a/hypervideo_dl/downloader/__init__.py +++ b/hypervideo_dl/downloader/__init__.py @@ -12,10 +12,15 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=N info_copy = info_dict.copy() info_copy['to_stdout'] = to_stdout - downloaders = [_get_suitable_downloader(info_copy, proto, params, default) - for proto in (protocol or info_copy['protocol']).split('+')] + protocols = (protocol or info_copy['protocol']).split('+') + downloaders = [_get_suitable_downloader(info_copy, proto, params, default) for proto in protocols] + if set(downloaders) == {FFmpegFD} and FFmpegFD.can_merge_formats(info_copy, params): return FFmpegFD + elif (set(downloaders) == {DashSegmentsFD} + and not (to_stdout and len(protocols) > 1) + and set(protocols) == {'http_dash_segments_generator'}): + return DashSegmentsFD elif len(downloaders) == 1: return downloaders[0] return None @@ -25,6 +30,7 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=N from .common import FileDownloader from .dash import DashSegmentsFD from .f4m import F4mFD +from .fc2 import FC2LiveFD from .hls import HlsFD from .http import HttpFD from .rtmp import RtmpFD @@ -41,6 +47,7 @@ from .external import ( PROTOCOL_MAP = { 'rtmp': RtmpFD, + 'rtmpe': RtmpFD, 'rtmp_ffmpeg': FFmpegFD, 'm3u8_native': HlsFD, 'm3u8': FFmpegFD, @@ -48,9 +55,11 @@ PROTOCOL_MAP = { 'rtsp': RtspFD, 'f4m': F4mFD, 'http_dash_segments': DashSegmentsFD, + 'http_dash_segments_generator': DashSegmentsFD, 'ism': IsmFD, 'mhtml': MhtmlFD, 'niconico_dmc': NiconicoDmcFD, + 'fc2_live': FC2LiveFD, 'websocket_frag': WebSocketFragmentFD, 'youtube_live_chat': YoutubeLiveChatFD, 'youtube_live_chat_replay': YoutubeLiveChatFD, @@ -62,6 +71,7 @@ def shorten_protocol_name(proto, simplify=False): 'm3u8_native': 'm3u8_n', 'rtmp_ffmpeg': 'rtmp_f', 'http_dash_segments': 'dash', + 'http_dash_segments_generator': 'dash_g', 'niconico_dmc': 'dmc', 'websocket_frag': 'WSfrag', } @@ -70,6 +80,7 @@ def shorten_protocol_name(proto, simplify=False): 'https': 'http', 'ftps': 'ftp', 'm3u8_native': 'm3u8', + 'http_dash_segments_generator': 'dash', 'rtmp_ffmpeg': 'rtmp', 'm3u8_frag_urls': 'm3u8', 'dash_frag_urls': 'dash', @@ -108,7 +119,7 @@ def _get_suitable_downloader(info_dict, protocol, params, default): return FFmpegFD elif (external_downloader or '').lower() == 'native': return HlsFD - elif get_suitable_downloader( + elif protocol == 'm3u8_native' and get_suitable_downloader( info_dict, params, None, protocol='m3u8_frag_urls', to_stdout=info_dict['to_stdout']): return HlsFD elif params.get('hls_prefer_native') is True: diff --git a/hypervideo_dl/downloader/common.py b/hypervideo_dl/downloader/common.py index 27ca2cd..7cef3e8 100644 --- a/hypervideo_dl/downloader/common.py +++ 
b/hypervideo_dl/downloader/common.py @@ -4,14 +4,17 @@ import os import re import time import random +import errno from ..utils import ( decodeArgument, encodeFilename, error_to_compat_str, format_bytes, + sanitize_open, shell_quote, timeconvert, + timetuple_from_msec, ) from ..minicurses import ( MultilineLogger, @@ -38,6 +41,7 @@ class FileDownloader(object): ratelimit: Download speed limit, in bytes/sec. throttledratelimit: Assume the download is being throttled below this speed (bytes/sec) retries: Number of times to retry for HTTP error 5xx + file_access_retries: Number of times to retry on file access error buffersize: Size of download buffer in bytes. noresizebuffer: Do not automatically resize the download buffer. continuedl: Try to continue downloads if possible. @@ -75,14 +79,12 @@ class FileDownloader(object): @staticmethod def format_seconds(seconds): - (mins, secs) = divmod(seconds, 60) - (hours, mins) = divmod(mins, 60) - if hours > 99: + time = timetuple_from_msec(seconds * 1000) + if time.hours > 99: return '--:--:--' - if hours == 0: - return '%02d:%02d' % (mins, secs) - else: - return '%02d:%02d:%02d' % (hours, mins, secs) + if not time.hours: + return '%02d:%02d' % time[1:-1] + return '%02d:%02d:%02d' % time[:-1] @staticmethod def calc_percent(byte_counter, data_len): @@ -94,6 +96,8 @@ class FileDownloader(object): def format_percent(percent): if percent is None: return '---.-%' + elif percent == 100: + return '100%' return '%6s' % ('%3.1f%%' % percent) @staticmethod @@ -155,7 +159,7 @@ class FileDownloader(object): return int(round(number * multiplier)) def to_screen(self, *args, **kargs): - self.ydl.to_stdout(*args, quiet=self.params.get('quiet'), **kargs) + self.ydl.to_screen(*args, quiet=self.params.get('quiet'), **kargs) def to_stderr(self, message): self.ydl.to_stderr(message) @@ -206,13 +210,41 @@ class FileDownloader(object): def ytdl_filename(self, filename): return filename + '.ytdl' + def wrap_file_access(action, *, fatal=False): + def outer(func): + def inner(self, *args, **kwargs): + file_access_retries = self.params.get('file_access_retries', 0) + retry = 0 + while True: + try: + return func(self, *args, **kwargs) + except (IOError, OSError) as err: + retry = retry + 1 + if retry > file_access_retries or err.errno not in (errno.EACCES, errno.EINVAL): + if not fatal: + self.report_error(f'unable to {action} file: {err}') + return + raise + self.to_screen( + f'[download] Unable to {action} file due to file access error. 
' + f'Retrying (attempt {retry} of {self.format_retries(file_access_retries)}) ...') + time.sleep(0.01) + return inner + return outer + + @wrap_file_access('open', fatal=True) + def sanitize_open(self, filename, open_mode): + return sanitize_open(filename, open_mode) + + @wrap_file_access('remove') + def try_remove(self, filename): + os.remove(filename) + + @wrap_file_access('rename') def try_rename(self, old_filename, new_filename): if old_filename == new_filename: return - try: - os.replace(old_filename, new_filename) - except (IOError, OSError) as err: - self.report_error(f'unable to rename file: {err}') + os.replace(old_filename, new_filename) def try_utime(self, filename, last_modified_hdr): """Try to set the last-modified time of the given file.""" @@ -245,14 +277,32 @@ class FileDownloader(object): elif self.ydl.params.get('logger'): self._multiline = MultilineLogger(self.ydl.params['logger'], lines) elif self.params.get('progress_with_newline'): - self._multiline = BreaklineStatusPrinter(self.ydl._screen_file, lines) + self._multiline = BreaklineStatusPrinter(self.ydl._out_files['screen'], lines) else: - self._multiline = MultilinePrinter(self.ydl._screen_file, lines, not self.params.get('quiet')) + self._multiline = MultilinePrinter(self.ydl._out_files['screen'], lines, not self.params.get('quiet')) + self._multiline.allow_colors = self._multiline._HAVE_FULLCAP and not self.params.get('no_color') def _finish_multiline_status(self): self._multiline.end() - def _report_progress_status(self, s): + _progress_styles = { + 'downloaded_bytes': 'light blue', + 'percent': 'light blue', + 'eta': 'yellow', + 'speed': 'green', + 'elapsed': 'bold white', + 'total_bytes': '', + 'total_bytes_estimate': '', + } + + def _report_progress_status(self, s, default_template): + for name, style in self._progress_styles.items(): + name = f'_{name}_str' + if name not in s: + continue + s[name] = self._format_progress(s[name], style) + s['_default_template'] = default_template % s + progress_dict = s.copy() progress_dict.pop('info_dict') progress_dict = {'info': s['info_dict'], 'progress': progress_dict} @@ -265,6 +315,10 @@ class FileDownloader(object): progress_template.get('download-title') or 'hypervideo %(progress._default_template)s', progress_dict)) + def _format_progress(self, *args, **kwargs): + return self.ydl._format_text( + self._multiline.stream, self._multiline.allow_colors, *args, **kwargs) + def report_progress(self, s): if s['status'] == 'finished': if self.params.get('noprogress'): @@ -277,8 +331,7 @@ class FileDownloader(object): s['_elapsed_str'] = self.format_seconds(s['elapsed']) msg_template += ' in %(_elapsed_str)s' s['_percent_str'] = self.format_percent(100) - s['_default_template'] = msg_template % s - self._report_progress_status(s) + self._report_progress_status(s, msg_template) return if s['status'] != 'downloading': @@ -287,7 +340,7 @@ class FileDownloader(object): if s.get('eta') is not None: s['_eta_str'] = self.format_eta(s['eta']) else: - s['_eta_str'] = 'Unknown ETA' + s['_eta_str'] = 'Unknown' if s.get('total_bytes') and s.get('downloaded_bytes') is not None: s['_percent_str'] = self.format_percent(100 * s['downloaded_bytes'] / s['total_bytes']) @@ -319,9 +372,12 @@ class FileDownloader(object): else: msg_template = '%(_downloaded_bytes_str)s at %(_speed_str)s' else: - msg_template = '%(_percent_str)s % at %(_speed_str)s ETA %(_eta_str)s' - s['_default_template'] = msg_template % s - self._report_progress_status(s) + msg_template = '%(_percent_str)s at %(_speed_str)s 
ETA %(_eta_str)s' + if s.get('fragment_index') and s.get('fragment_count'): + msg_template += ' (frag %(fragment_index)s/%(fragment_count)s)' + elif s.get('fragment_index'): + msg_template += ' (frag %(fragment_index)s)' + self._report_progress_status(s, msg_template) def report_resuming_byte(self, resume_len): """Report attempt to resume at given byte.""" @@ -372,6 +428,7 @@ class FileDownloader(object): 'status': 'finished', 'total_bytes': os.path.getsize(encodeFilename(filename)), }, info_dict) + self._finish_multiline_status() return True, False if subtitle is False: diff --git a/hypervideo_dl/downloader/dash.py b/hypervideo_dl/downloader/dash.py index 6444ad6..a845ee7 100644 --- a/hypervideo_dl/downloader/dash.py +++ b/hypervideo_dl/downloader/dash.py @@ -1,4 +1,5 @@ from __future__ import unicode_literals +import time from ..downloader import get_suitable_downloader from .fragment import FragmentFD @@ -15,27 +16,53 @@ class DashSegmentsFD(FragmentFD): FD_NAME = 'dashsegments' def real_download(self, filename, info_dict): - if info_dict.get('is_live'): + if info_dict.get('is_live') and set(info_dict['protocol'].split('+')) != {'http_dash_segments_generator'}: self.report_error('Live DASH videos are not supported') - fragment_base_url = info_dict.get('fragment_base_url') - fragments = info_dict['fragments'][:1] if self.params.get( - 'test', False) else info_dict['fragments'] - + real_start = time.time() real_downloader = get_suitable_downloader( info_dict, self.params, None, protocol='dash_frag_urls', to_stdout=(filename == '-')) - ctx = { - 'filename': filename, - 'total_frags': len(fragments), - } + requested_formats = [{**info_dict, **fmt} for fmt in info_dict.get('requested_formats', [])] + args = [] + for fmt in requested_formats or [info_dict]: + try: + fragment_count = 1 if self.params.get('test') else len(fmt['fragments']) + except TypeError: + fragment_count = None + ctx = { + 'filename': fmt.get('filepath') or filename, + 'live': 'is_from_start' if fmt.get('is_from_start') else fmt.get('is_live'), + 'total_frags': fragment_count, + } + + if real_downloader: + self._prepare_external_frag_download(ctx) + else: + self._prepare_and_start_frag_download(ctx, fmt) + ctx['start'] = real_start + + fragments_to_download = self._get_fragments(fmt, ctx) + + if real_downloader: + self.to_screen( + '[%s] Fragment downloads will be delegated to %s' % (self.FD_NAME, real_downloader.get_basename())) + info_dict['fragments'] = list(fragments_to_download) + fd = real_downloader(self.ydl, self.params) + return fd.real_download(filename, info_dict) + + args.append([ctx, fragments_to_download, fmt]) - if real_downloader: - self._prepare_external_frag_download(ctx) - else: - self._prepare_and_start_frag_download(ctx, info_dict) + return self.download_and_append_fragments_multiple(*args) + + def _resolve_fragments(self, fragments, ctx): + fragments = fragments(ctx) if callable(fragments) else fragments + return [next(iter(fragments))] if self.params.get('test') else fragments + + def _get_fragments(self, fmt, ctx): + fragment_base_url = fmt.get('fragment_base_url') + fragments = self._resolve_fragments(fmt['fragments'], ctx) - fragments_to_download = [] frag_index = 0 for i, fragment in enumerate(fragments): frag_index += 1 @@ -46,17 +73,8 @@ class DashSegmentsFD(FragmentFD): assert fragment_base_url fragment_url = urljoin(fragment_base_url, fragment['path']) - fragments_to_download.append({ + yield { 'frag_index': frag_index, 'index': i, 'url': fragment_url, - }) - - if real_downloader: - 
self.to_screen( - '[%s] Fragment downloads will be delegated to %s' % (self.FD_NAME, real_downloader.get_basename())) - info_dict['fragments'] = fragments_to_download - fd = real_downloader(self.ydl, self.params) - return fd.real_download(filename, info_dict) - - return self.download_and_append_fragments(ctx, fragments_to_download, info_dict) + } diff --git a/hypervideo_dl/downloader/external.py b/hypervideo_dl/downloader/external.py index 74adb05..b99dc37 100644 --- a/hypervideo_dl/downloader/external.py +++ b/hypervideo_dl/downloader/external.py @@ -13,17 +13,18 @@ from ..compat import ( ) from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS from ..utils import ( + classproperty, cli_option, cli_valueless_option, cli_bool_option, _configuration_args, + determine_ext, encodeFilename, encodeArgument, handle_youtubedl_headers, check_executable, - is_outdated_version, - process_communicate_or_kill, - sanitize_open, + Popen, + remove_end, ) @@ -73,17 +74,23 @@ class ExternalFD(FragmentFD): def get_basename(cls): return cls.__name__[:-2].lower() + @classproperty + def EXE_NAME(cls): + return cls.get_basename() + @property def exe(self): - return self.get_basename() + return self.EXE_NAME @classmethod def available(cls, path=None): - path = check_executable(path or cls.get_basename(), [cls.AVAILABLE_OPT]) - if path: - cls.exe = path - return path - return False + path = check_executable( + cls.EXE_NAME if path in (None, cls.get_basename()) else path, + [cls.AVAILABLE_OPT]) + if not path: + return False + cls.exe = path + return path @classmethod def supports(cls, info_dict): @@ -106,7 +113,7 @@ class ExternalFD(FragmentFD): def _configuration_args(self, keys=None, *args, **kwargs): return _configuration_args( - self.get_basename(), self.params.get('external_downloader_args'), self.get_basename(), + self.get_basename(), self.params.get('external_downloader_args'), self.EXE_NAME, keys, *args, **kwargs) def _call_downloader(self, tmpfilename, info_dict): @@ -116,9 +123,8 @@ class ExternalFD(FragmentFD): self._debug_cmd(cmd) if 'fragments' not in info_dict: - p = subprocess.Popen( - cmd, stderr=subprocess.PIPE) - _, stderr = process_communicate_or_kill(p) + p = Popen(cmd, stderr=subprocess.PIPE) + _, stderr = p.communicate_or_kill() if p.returncode != 0: self.to_stderr(stderr.decode('utf-8', 'replace')) return p.returncode @@ -128,9 +134,8 @@ class ExternalFD(FragmentFD): count = 0 while count <= fragment_retries: - p = subprocess.Popen( - cmd, stderr=subprocess.PIPE) - _, stderr = process_communicate_or_kill(p) + p = Popen(cmd, stderr=subprocess.PIPE) + _, stderr = p.communicate_or_kill() if p.returncode == 0: break # TODO: Decide whether to retry based on error code @@ -147,23 +152,23 @@ class ExternalFD(FragmentFD): return -1 decrypt_fragment = self.decrypter(info_dict) - dest, _ = sanitize_open(tmpfilename, 'wb') + dest, _ = self.sanitize_open(tmpfilename, 'wb') for frag_index, fragment in enumerate(info_dict['fragments']): fragment_filename = '%s-Frag%d' % (tmpfilename, frag_index) try: - src, _ = sanitize_open(fragment_filename, 'rb') - except IOError: + src, _ = self.sanitize_open(fragment_filename, 'rb') + except IOError as err: if skip_unavailable_fragments and frag_index > 1: - self.to_screen('[%s] Skipping fragment %d ...' 
% (self.get_basename(), frag_index)) + self.report_skip_fragment(frag_index, err) continue - self.report_error('Unable to open fragment %d' % frag_index) + self.report_error(f'Unable to open fragment {frag_index}; {err}') return -1 dest.write(decrypt_fragment(fragment, src.read())) src.close() if not self.params.get('keep_fragments', False): - os.remove(encodeFilename(fragment_filename)) + self.try_remove(encodeFilename(fragment_filename)) dest.close() - os.remove(encodeFilename('%s.frag.urls' % tmpfilename)) + self.try_remove(encodeFilename('%s.frag.urls' % tmpfilename)) return 0 @@ -171,7 +176,7 @@ class CurlFD(ExternalFD): AVAILABLE_OPT = '-V' def _make_cmd(self, tmpfilename, info_dict): - cmd = [self.exe, '--location', '-o', tmpfilename] + cmd = [self.exe, '--location', '-o', tmpfilename, '--compressed'] if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] @@ -199,8 +204,8 @@ class CurlFD(ExternalFD): self._debug_cmd(cmd) # curl writes the progress to stderr so don't capture it. - p = subprocess.Popen(cmd) - process_communicate_or_kill(p) + p = Popen(cmd) + p.communicate_or_kill() return p.returncode @@ -221,7 +226,7 @@ class WgetFD(ExternalFD): AVAILABLE_OPT = '--version' def _make_cmd(self, tmpfilename, info_dict): - cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies'] + cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies', '--compression=auto'] if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] @@ -232,7 +237,10 @@ class WgetFD(ExternalFD): retry[1] = '0' cmd += retry cmd += self._option('--bind-address', 'source_address') - cmd += self._option('--proxy', 'proxy') + proxy = self.params.get('proxy') + if proxy: + for var in ('http_proxy', 'https_proxy'): + cmd += ['--execute', '%s=%s' % (var, proxy)] cmd += self._valueless_option('--no-check-certificate', 'nocheckcertificate') cmd += self._configuration_args() cmd += ['--', info_dict['url']] @@ -255,7 +263,7 @@ class Aria2cFD(ExternalFD): def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-c', '--console-log-level=warn', '--summary-interval=0', '--download-result=hide', - '--file-allocation=none', '-x16', '-j16', '-s16'] + '--http-accept-gzip=true', '--file-allocation=none', '-x16', '-j16', '-s16'] if 'fragments' in info_dict: cmd += ['--allow-overwrite=true', '--allow-piece-length-change=true'] else: @@ -269,6 +277,7 @@ class Aria2cFD(ExternalFD): cmd += self._option('--all-proxy', 'proxy') cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=') cmd += self._bool_option('--remote-time', 'updatetime', 'true', 'false', '=') + cmd += self._bool_option('--show-console-readout', 'noprogress', 'false', 'true', '=') cmd += self._configuration_args() # aria2c strips out spaces from the beginning/end of filenames and paths. 
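For context on the next Aria2cFD hunk: aria2c's `-i` option consumes an input file that pairs each fragment URL with an indented per-entry `out=` option naming that fragment's output file. A minimal sketch of building such a file, mirroring the `url_list` code in the hunk below (the URLs and tmpfilename here are illustrative placeholders, not values from the patch):

# Sketch (not part of the patch): the aria2c input-file format used by the
# following hunk -- one fragment URL per entry, each followed by an indented
# 'out=' option line selecting that fragment's output filename.
# The URLs and tmpfilename below are made-up examples.
fragments = [{'url': 'https://example.com/frag%d.ts' % i} for i in range(3)]
tmpfilename = 'video.part'
url_list = []
for frag_index, fragment in enumerate(fragments):
    fragment_filename = '%s-Frag%d' % (tmpfilename, frag_index)
    url_list.append('%s\n\tout=%s' % (fragment['url'], fragment_filename))
with open('%s.frag.urls' % tmpfilename, 'wb') as stream:
    stream.write('\n'.join(url_list).encode('utf-8'))
# aria2c would then read this list via: aria2c -i video.part.frag.urls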
@@ -293,7 +302,7 @@ class Aria2cFD(ExternalFD): for frag_index, fragment in enumerate(info_dict['fragments']): fragment_filename = '%s-Frag%d' % (os.path.basename(tmpfilename), frag_index) url_list.append('%s\n\tout=%s' % (fragment['url'], fragment_filename)) - stream, _ = sanitize_open(url_list_file, 'wb') + stream, _ = self.sanitize_open(url_list_file, 'wb') stream.write('\n'.join(url_list).encode('utf-8')) stream.close() cmd += ['-i', url_list_file] @@ -304,10 +313,7 @@ class Aria2cFD(ExternalFD): class HttpieFD(ExternalFD): AVAILABLE_OPT = '--version' - - @classmethod - def available(cls, path=None): - return ExternalFD.available(cls, path or 'http') + EXE_NAME = 'http' def _make_cmd(self, tmpfilename, info_dict): cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']] @@ -446,8 +452,7 @@ class FFmpegFD(ExternalFD): if info_dict.get('requested_formats') or protocol == 'http_dash_segments': for (i, fmt) in enumerate(info_dict.get('requested_formats') or [info_dict]): stream_number = fmt.get('manifest_stream_number', 0) - a_or_v = 'a' if fmt.get('acodec') != 'none' else 'v' - args.extend(['-map', f'{i}:{a_or_v}:{stream_number}']) + args.extend(['-map', f'{i}:{stream_number}']) if self.params.get('test', False): args += ['-fs', compat_str(self._TEST_FILE_SIZE)] @@ -461,12 +466,21 @@ class FFmpegFD(ExternalFD): args += ['-f', 'mpegts'] else: args += ['-f', 'mp4'] - if (ffpp.basename == 'ffmpeg' and is_outdated_version(ffpp._versions['ffmpeg'], '3.2', False)) and (not info_dict.get('acodec') or info_dict['acodec'].split('.')[0] in ('aac', 'mp4a')): + if (ffpp.basename == 'ffmpeg' and ffpp._features.get('needs_adtstoasc')) and (not info_dict.get('acodec') or info_dict['acodec'].split('.')[0] in ('aac', 'mp4a')): args += ['-bsf:a', 'aac_adtstoasc'] elif protocol == 'rtmp': args += ['-f', 'flv'] elif ext == 'mp4' and tmpfilename == '-': args += ['-f', 'mpegts'] + elif ext == 'unknown_video': + ext = determine_ext(remove_end(tmpfilename, '.part')) + if ext == 'unknown_video': + self.report_warning( + 'The video format is unknown and cannot be downloaded by ffmpeg. ' + 'Explicitly set the extension in the filename to attempt download in that format') + else: + self.report_warning(f'The video format is unknown. Trying to download as {ext} according to the filename') + args += ['-f', EXT_TO_OUT_FORMATS.get(ext, ext)] else: args += ['-f', EXT_TO_OUT_FORMATS.get(ext, ext)] @@ -476,7 +490,7 @@ class FFmpegFD(ExternalFD): args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True)) self._debug_cmd(args) - proc = subprocess.Popen(args, stdin=subprocess.PIPE, env=env) + proc = Popen(args, stdin=subprocess.PIPE, env=env) if url in ('-', 'pipe:'): self.on_process_started(proc, proc.stdin) try: @@ -488,7 +502,7 @@ class FFmpegFD(ExternalFD): # streams). Note that Windows is not affected and produces playable # files (see https://github.com/ytdl-org/youtube-dl/issues/8300). 
if isinstance(e, KeyboardInterrupt) and sys.platform != 'win32' and url not in ('-', 'pipe:'): - process_communicate_or_kill(proc, b'q') + proc.communicate_or_kill(b'q') else: proc.kill() proc.wait() @@ -500,11 +514,13 @@ class AVconvFD(FFmpegFD): pass -_BY_NAME = dict( - (klass.get_basename(), klass) +_BY_NAME = { + klass.get_basename(): klass for name, klass in globals().items() if name.endswith('FD') and name not in ('ExternalFD', 'FragmentFD') -) +} + +_BY_EXE = {klass.EXE_NAME: klass for klass in _BY_NAME.values()} def list_external_downloaders(): @@ -516,4 +532,4 @@ def get_external_downloader(external_downloader): downloader . """ # Drop .exe extension on Windows bn = os.path.splitext(os.path.basename(external_downloader))[0] - return _BY_NAME.get(bn) + return _BY_NAME.get(bn, _BY_EXE.get(bn)) diff --git a/hypervideo_dl/downloader/f4m.py b/hypervideo_dl/downloader/f4m.py index 9da2776..0008b7c 100644 --- a/hypervideo_dl/downloader/f4m.py +++ b/hypervideo_dl/downloader/f4m.py @@ -366,7 +366,7 @@ class F4mFD(FragmentFD): ctx = { 'filename': filename, 'total_frags': total_frags, - 'live': live, + 'live': bool(live), } self._prepare_frag_download(ctx) diff --git a/hypervideo_dl/downloader/fc2.py b/hypervideo_dl/downloader/fc2.py new file mode 100644 index 0000000..157bcf2 --- /dev/null +++ b/hypervideo_dl/downloader/fc2.py @@ -0,0 +1,41 @@ +from __future__ import division, unicode_literals + +import threading + +from .common import FileDownloader +from .external import FFmpegFD + + +class FC2LiveFD(FileDownloader): + """ + Downloads FC2 live without being stopped.
+ Note, this is not a part of public API, and will be removed without notice. + DO NOT USE + """ + + def real_download(self, filename, info_dict): + ws = info_dict['ws'] + + heartbeat_lock = threading.Lock() + heartbeat_state = [None, 1] + + def heartbeat(): + try: + heartbeat_state[1] += 1 + ws.send('{"name":"heartbeat","arguments":{},"id":%d}' % heartbeat_state[1]) + except Exception: + self.to_screen('[fc2:live] Heartbeat failed') + + with heartbeat_lock: + heartbeat_state[0] = threading.Timer(30, heartbeat) + heartbeat_state[0]._daemonic = True + heartbeat_state[0].start() + + heartbeat() + + new_info_dict = info_dict.copy() + new_info_dict.update({ + 'ws': None, + 'protocol': 'live_ffmpeg', + }) + return FFmpegFD(self.ydl, self.params or {}).download(filename, new_info_dict) diff --git a/hypervideo_dl/downloader/fragment.py b/hypervideo_dl/downloader/fragment.py index 57068db..a991c6d 100644 --- a/hypervideo_dl/downloader/fragment.py +++ b/hypervideo_dl/downloader/fragment.py @@ -1,9 +1,10 @@ from __future__ import division, unicode_literals +import http.client +import json +import math import os import time -import json -from math import ceil try: import concurrent.futures @@ -13,8 +14,9 @@ except ImportError: from .common import FileDownloader from .http import HttpFD -from ..aes import aes_cbc_decrypt_bytes +from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7 from ..compat import ( + compat_os_name, compat_urllib_error, compat_struct_pack, ) @@ -22,8 +24,8 @@ from ..utils import ( DownloadError, error_to_compat_str, encodeFilename, - sanitize_open, sanitized_Request, + traverse_obj, ) @@ -31,6 +33,10 @@ class HttpQuietDownloader(HttpFD): def to_screen(self, *args, **kargs): pass + def report_retry(self, err, count, retries): + super().to_screen( + f'[download] Got server HTTP error: {err}. Retrying (attempt {count} of {self.format_retries(retries)}) ...') + class FragmentFD(FileDownloader): """ @@ -44,6 +50,7 @@ class FragmentFD(FileDownloader): Skip unavailable fragments (DASH and hlsnative only) keep_fragments: Keep downloaded fragments on disk after downloading is finished + concurrent_fragment_downloads: The number of threads to use for native hls and dash downloads _no_ytdl_file: Don't use .ytdl file For each incomplete fragment download hypervideo keeps on disk a special @@ -72,8 +79,9 @@ class FragmentFD(FileDownloader): '\r[download] Got server HTTP error: %s. Retrying fragment %d (attempt %d of %s) ...' % (error_to_compat_str(err), frag_index, count, self.format_retries(retries))) - def report_skip_fragment(self, frag_index): - self.to_screen('[download] Skipping fragment %d ...' 
% frag_index) + def report_skip_fragment(self, frag_index, err=None): + err = f' {err};' if err else '' + self.to_screen(f'[download]{err} Skipping fragment {frag_index:d} ...') def _prepare_url(self, info_dict, url): headers = info_dict.get('http_headers') @@ -84,11 +92,11 @@ class FragmentFD(FileDownloader): self._start_frag_download(ctx, info_dict) def __do_ytdl_file(self, ctx): - return not ctx['live'] and not ctx['tmpfilename'] == '-' and not self.params.get('_no_ytdl_file') + return ctx['live'] is not True and ctx['tmpfilename'] != '-' and not self.params.get('_no_ytdl_file') def _read_ytdl_file(self, ctx): assert 'ytdl_corrupt' not in ctx - stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'r') + stream, _ = self.sanitize_open(self.ytdl_filename(ctx['filename']), 'r') try: ytdl_data = json.loads(stream.read()) ctx['fragment_index'] = ytdl_data['downloader']['current_fragment']['index'] @@ -100,7 +108,7 @@ class FragmentFD(FileDownloader): stream.close() def _write_ytdl_file(self, ctx): - frag_index_stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'w') + frag_index_stream, _ = self.sanitize_open(self.ytdl_filename(ctx['filename']), 'w') try: downloader = { 'current_fragment': { @@ -125,14 +133,19 @@ class FragmentFD(FileDownloader): } success = ctx['dl'].download(fragment_filename, fragment_info_dict) if not success: - return False, None + return False if fragment_info_dict.get('filetime'): ctx['fragment_filetime'] = fragment_info_dict.get('filetime') ctx['fragment_filename_sanitized'] = fragment_filename - return True, self._read_fragment(ctx) + return True def _read_fragment(self, ctx): - down, frag_sanitized = sanitize_open(ctx['fragment_filename_sanitized'], 'rb') + try: + down, frag_sanitized = self.sanitize_open(ctx['fragment_filename_sanitized'], 'rb') + except FileNotFoundError: + if ctx.get('live'): + return None + raise ctx['fragment_filename_sanitized'] = frag_sanitized frag_content = down.read() down.close() @@ -146,7 +159,7 @@ class FragmentFD(FileDownloader): if self.__do_ytdl_file(ctx): self._write_ytdl_file(ctx) if not self.params.get('keep_fragments', False): - os.remove(encodeFilename(ctx['fragment_filename_sanitized'])) + self.try_remove(encodeFilename(ctx['fragment_filename_sanitized'])) del ctx['fragment_filename_sanitized'] def _prepare_frag_download(self, ctx): @@ -165,8 +178,8 @@ class FragmentFD(FileDownloader): dl = HttpQuietDownloader( self.ydl, { - 'continuedl': True, - 'quiet': True, + 'continuedl': self.params.get('continuedl', True), + 'quiet': self.params.get('quiet'), 'noprogress': True, 'ratelimit': self.params.get('ratelimit'), 'retries': self.params.get('retries', 0), @@ -208,7 +221,7 @@ class FragmentFD(FileDownloader): self._write_ytdl_file(ctx) assert ctx['fragment_index'] == 0 - dest_stream, tmpfilename = sanitize_open(tmpfilename, open_mode) + dest_stream, tmpfilename = self.sanitize_open(tmpfilename, open_mode) ctx.update({ 'dl': dl, @@ -236,6 +249,7 @@ class FragmentFD(FileDownloader): start = time.time() ctx.update({ 'started': start, + 'fragment_started': start, # Amount of fragment's bytes downloaded by the time of the previous # frag progress hook invocation 'prev_frag_downloaded_bytes': 0, @@ -266,6 +280,9 @@ class FragmentFD(FileDownloader): ctx['fragment_index'] = state['fragment_index'] state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes'] ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes'] + ctx['speed'] = state['speed'] = self.calc_speed( + 
ctx['fragment_started'], time_now, frag_total_bytes) + ctx['fragment_started'] = time.time() ctx['prev_frag_downloaded_bytes'] = 0 else: frag_downloaded_bytes = s['downloaded_bytes'] @@ -274,8 +291,8 @@ class FragmentFD(FileDownloader): state['eta'] = self.calc_eta( start, time_now, estimated_size - resume_len, state['downloaded_bytes'] - resume_len) - state['speed'] = s.get('speed') or ctx.get('speed') - ctx['speed'] = state['speed'] + ctx['speed'] = state['speed'] = self.calc_speed( + ctx['fragment_started'], time_now, frag_downloaded_bytes) ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes self._hook_progress(state, info_dict) @@ -288,7 +305,7 @@ class FragmentFD(FileDownloader): if self.__do_ytdl_file(ctx): ytdl_filename = encodeFilename(self.ytdl_filename(ctx['filename'])) if os.path.isfile(ytdl_filename): - os.remove(ytdl_filename) + self.try_remove(ytdl_filename) elapsed = time.time() - ctx['started'] if ctx['tmpfilename'] == '-': @@ -355,9 +372,7 @@ class FragmentFD(FileDownloader): # not what it decrypts to. if self.params.get('test', False): return frag_content - padding_len = 16 - (len(frag_content) % 16) - decrypted_data = aes_cbc_decrypt_bytes(frag_content + bytes([padding_len] * padding_len), decrypt_info['KEY'], iv) - return decrypted_data[:-decrypted_data[-1]] + return unpad_pkcs7(aes_cbc_decrypt_bytes(frag_content, decrypt_info['KEY'], iv)) return decrypt_fragment @@ -366,64 +381,105 @@ class FragmentFD(FileDownloader): @params (ctx1, fragments1, info_dict1), (ctx2, fragments2, info_dict2), ... all args must be either tuple or list ''' + interrupt_trigger = [True] max_progress = len(args) if max_progress == 1: return self.download_and_append_fragments(*args[0], pack_func=pack_func, finish_func=finish_func) - max_workers = self.params.get('concurrent_fragment_downloads', max_progress) - self._prepare_multiline_status(max_progress) + max_workers = self.params.get('concurrent_fragment_downloads', 1) + if max_progress > 1: + self._prepare_multiline_status(max_progress) + is_live = any(traverse_obj(args, (..., 2, 'is_live'), default=[])) def thread_func(idx, ctx, fragments, info_dict, tpe): ctx['max_progress'] = max_progress ctx['progress_idx'] = idx - return self.download_and_append_fragments(ctx, fragments, info_dict, pack_func=pack_func, finish_func=finish_func, tpe=tpe) + return self.download_and_append_fragments( + ctx, fragments, info_dict, pack_func=pack_func, finish_func=finish_func, + tpe=tpe, interrupt_trigger=interrupt_trigger) class FTPE(concurrent.futures.ThreadPoolExecutor): # has to stop this or it's going to wait on the worker thread itself def __exit__(self, exc_type, exc_val, exc_tb): pass + if compat_os_name == 'nt': + def future_result(future): + while True: + try: + return future.result(0.1) + except KeyboardInterrupt: + raise + except concurrent.futures.TimeoutError: + continue + else: + def future_result(future): + return future.result() + + def interrupt_trigger_iter(fg): + for f in fg: + if not interrupt_trigger[0]: + break + yield f + spins = [] for idx, (ctx, fragments, info_dict) in enumerate(args): - tpe = FTPE(ceil(max_workers / max_progress)) - job = tpe.submit(thread_func, idx, ctx, fragments, info_dict, tpe) + tpe = FTPE(math.ceil(max_workers / max_progress)) + job = tpe.submit(thread_func, idx, ctx, interrupt_trigger_iter(fragments), info_dict, tpe) spins.append((tpe, job)) result = True for tpe, job in spins: try: - result = result and job.result() + result = result and future_result(job) + except KeyboardInterrupt: + 
interrupt_trigger[0] = False + finally: + tpe.shutdown(wait=True) + if not interrupt_trigger[0] and not is_live: + raise KeyboardInterrupt() + # we expect the user wants to stop and DOES WANT the preceding postprocessors to run; + # so returning an intermediate result here instead of KeyboardInterrupt on live return result - def download_and_append_fragments(self, ctx, fragments, info_dict, *, pack_func=None, finish_func=None, tpe=None): + def download_and_append_fragments( + self, ctx, fragments, info_dict, *, pack_func=None, finish_func=None, + tpe=None, interrupt_trigger=None): + if not interrupt_trigger: + interrupt_trigger = (True, ) + fragment_retries = self.params.get('fragment_retries', 0) - is_fatal = (lambda idx: idx == 0) if self.params.get('skip_unavailable_fragments', True) else (lambda _: True) + is_fatal = ( + ((lambda _: False) if info_dict.get('is_live') else (lambda idx: idx == 0)) + if self.params.get('skip_unavailable_fragments', True) else (lambda _: True)) + + if not pack_func: + pack_func = lambda frag_content, _: frag_content + + def download_fragment(fragment, ctx): + if not interrupt_trigger[0]: + return + + frag_index = ctx['fragment_index'] = fragment['frag_index'] + ctx['last_error'] = None + headers = info_dict.get('http_headers', {}).copy() + byte_range = fragment.get('byte_range') + if byte_range: + headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end'] - 1) # Never skip the first fragment - fatal = is_fatal(fragment.get('index') or (frag_index - 1)) - count, frag_content = 0, None + fatal, count = is_fatal(fragment.get('index') or (frag_index - 1)), 0 while count <= fragment_retries: try: - success, frag_content = self._download_fragment(ctx, fragment['url'], info_dict, headers) - if not success: - return False, frag_index - break - except compat_urllib_error.HTTPError as err: + if self._download_fragment(ctx, fragment['url'], info_dict, headers): + break + return + except (compat_urllib_error.HTTPError, http.client.IncompleteRead) as err: # Unavailable (possibly temporary) fragments may be served. # First we try to retry then either skip or abort. # See https://github.com/ytdl-org/youtube-dl/issues/10165, # https://github.com/ytdl-org/youtube-dl/issues/10448). 
count += 1 + ctx['last_error'] = err if count <= fragment_retries: self.report_retry_fragment(err, frag_index, count, fragment_retries) except DownloadError: @@ -433,49 +489,46 @@ class FragmentFD(FileDownloader): break raise - if count > fragment_retries: - if not fatal: - return False, frag_index + if count > fragment_retries and fatal: ctx['dest_stream'].close() self.report_error('Giving up after %s fragment retries' % fragment_retries) - return False, frag_index - return frag_content, frag_index def append_fragment(frag_content, frag_index, ctx): - if not frag_content: - if not is_fatal(frag_index - 1): - self.report_skip_fragment(frag_index) - return True - else: - ctx['dest_stream'].close() - self.report_error( - 'fragment %s not found, unable to continue' % frag_index) - return False - self._append_fragment(ctx, pack_func(frag_content, frag_index)) + if frag_content: + self._append_fragment(ctx, pack_func(frag_content, frag_index)) + elif not is_fatal(frag_index - 1): + self.report_skip_fragment(frag_index, 'fragment not found') + else: + ctx['dest_stream'].close() + self.report_error(f'fragment {frag_index} not found, unable to continue') + return False return True decrypt_fragment = self.decrypter(info_dict) - max_workers = self.params.get('concurrent_fragment_downloads', 1) + max_workers = math.ceil( + self.params.get('concurrent_fragment_downloads', 1) / ctx.get('max_progress', 1)) if can_threaded_download and max_workers > 1: def _download_fragment(fragment): ctx_copy = ctx.copy() - frag_content, frag_index = download_fragment(fragment, ctx_copy) - return fragment, frag_content, frag_index, ctx_copy.get('fragment_filename_sanitized') + download_fragment(fragment, ctx_copy) + return fragment, fragment['frag_index'], ctx_copy.get('fragment_filename_sanitized') self.report_warning('The download speed shown is only of one thread. 
This is a known issue and patches are welcome') with tpe or concurrent.futures.ThreadPoolExecutor(max_workers) as pool: - for fragment, frag_content, frag_index, frag_filename in pool.map(_download_fragment, fragments): + for fragment, frag_index, frag_filename in pool.map(_download_fragment, fragments): ctx['fragment_filename_sanitized'] = frag_filename ctx['fragment_index'] = frag_index - result = append_fragment(decrypt_fragment(fragment, frag_content), frag_index, ctx) + result = append_fragment(decrypt_fragment(fragment, self._read_fragment(ctx)), frag_index, ctx) if not result: return False else: for fragment in fragments: - frag_content, frag_index = download_fragment(fragment, ctx) - result = append_fragment(decrypt_fragment(fragment, frag_content), frag_index, ctx) + if not interrupt_trigger[0]: + break + download_fragment(fragment, ctx) + result = append_fragment(decrypt_fragment(fragment, self._read_fragment(ctx)), fragment['frag_index'], ctx) if not result: return False diff --git a/hypervideo_dl/downloader/hls.py b/hypervideo_dl/downloader/hls.py index ef8a81b..f3f32b5 100644 --- a/hypervideo_dl/downloader/hls.py +++ b/hypervideo_dl/downloader/hls.py @@ -77,6 +77,15 @@ class HlsFD(FragmentFD): message = ('The stream has AES-128 encryption and neither ffmpeg nor pycryptodome are available; ' 'Decryption will be performed natively, but will be extremely slow') if not can_download: + has_drm = re.search('|'.join([ + r'#EXT-X-FAXS-CM:', # Adobe Flash Access + r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay + ]), s) + if has_drm and not self.params.get('allow_unplayable_formats'): + self.report_error( + 'This video is DRM protected; Try selecting another format with --format or ' + 'add --check-formats to automatically fallback to the next best format') + return False message = message or 'Unsupported features have been detected' fd = FFmpegFD(self.ydl, self.params) self.report_warning(f'{message}; extraction will be delegated to {fd.get_basename()}') diff --git a/hypervideo_dl/downloader/http.py b/hypervideo_dl/downloader/http.py index 2e95bb9..591a9b0 100644 --- a/hypervideo_dl/downloader/http.py +++ b/hypervideo_dl/downloader/http.py @@ -1,29 +1,30 @@ from __future__ import unicode_literals -import errno import os -import socket +import ssl import time import random -import re from .common import FileDownloader from ..compat import ( - compat_str, compat_urllib_error, + compat_http_client ) from ..utils import ( ContentTooShortError, encodeFilename, int_or_none, - sanitize_open, + parse_http_range, sanitized_Request, ThrottledDownload, + try_call, write_xattr, XAttrMetadataError, XAttrUnavailableError, ) +RESPONSE_READ_EXCEPTIONS = (TimeoutError, ConnectionError, ssl.SSLError, compat_http_client.HTTPException) + class HttpFD(FileDownloader): def real_download(self, filename, info_dict): @@ -54,11 +55,11 @@ class HttpFD(FileDownloader): ctx.open_mode = 'wb' ctx.resume_len = 0 - ctx.data_len = None ctx.block_size = self.params.get('buffersize', 1024) ctx.start_time = time.time() - ctx.chunk_size = None - throttle_start = None + + # parse given Range + req_start, req_end, _ = parse_http_range(headers.get('Range')) if self.params.get('continuedl', True): # Establish possible resume length @@ -81,43 +82,50 @@ class HttpFD(FileDownloader): class NextFragment(Exception): pass - def set_range(req, start, end): - range_header = 'bytes=%d-' % start - if end: - range_header += compat_str(end) - req.add_header('Range', range_header) - def establish_connection(): ctx.chunk_size = 
(random.randint(int(chunk_size * 0.95), chunk_size) if not is_test and chunk_size else chunk_size) if ctx.resume_len > 0: range_start = ctx.resume_len + if req_start is not None: + # offset the beginning of Range to be within request + range_start += req_start if ctx.is_resume: self.report_resuming_byte(ctx.resume_len) ctx.open_mode = 'ab' + elif req_start is not None: + range_start = req_start elif ctx.chunk_size > 0: range_start = 0 else: range_start = None ctx.is_resume = False - range_end = range_start + ctx.chunk_size - 1 if ctx.chunk_size else None - if range_end and ctx.data_len is not None and range_end >= ctx.data_len: - range_end = ctx.data_len - 1 - has_range = range_start is not None - ctx.has_range = has_range + + if ctx.chunk_size: + chunk_aware_end = range_start + ctx.chunk_size - 1 + # we're not allowed to download outside Range + range_end = chunk_aware_end if req_end is None else min(chunk_aware_end, req_end) + elif req_end is not None: + # there's no need for chunked downloads, so download until the end of Range + range_end = req_end + else: + range_end = None + + if try_call(lambda: range_start > range_end): + ctx.resume_len = 0 + ctx.open_mode = 'wb' + raise RetryDownload(Exception(f'Conflicting range. (start={range_start} > end={range_end})')) + + if try_call(lambda: range_end >= ctx.content_len): + range_end = ctx.content_len - 1 + request = sanitized_Request(url, request_data, headers) + has_range = range_start is not None if has_range: - set_range(request, range_start, range_end) + request.add_header('Range', f'bytes={int(range_start)}-{int_or_none(range_end) or ""}') # Establish connection try: - try: - ctx.data = self.ydl.urlopen(request) - except (compat_urllib_error.URLError, ) as err: - # reason may not be available, e.g. for urllib2.HTTPError on python 2.6 - reason = getattr(err, 'reason', None) - if isinstance(reason, socket.timeout): - raise RetryDownload(err) - raise err + ctx.data = self.ydl.urlopen(request) # When trying to resume, Content-Range HTTP header of response has to be checked # to match the value of requested Range HTTP header. 
This is due to webservers + # that don't support resuming and serve a whole file with no Content-Range @@ -125,31 +133,27 @@ # https://github.com/ytdl-org/youtube-dl/issues/6057#issuecomment-126129799) if has_range: content_range = ctx.data.headers.get('Content-Range') - if content_range: - content_range_m = re.search(r'bytes (\d+)-(\d+)?(?:/(\d+))?', content_range) + content_range_start, content_range_end, content_len = parse_http_range(content_range) + if content_range_start is not None and range_start == content_range_start: # Content-Range is present and matches requested Range, resume is possible - if content_range_m: - if range_start == int(content_range_m.group(1)): - content_range_end = int_or_none(content_range_m.group(2)) - content_len = int_or_none(content_range_m.group(3)) - accept_content_len = ( - # Non-chunked download - not ctx.chunk_size - # Chunked download and requested piece or - # its part is promised to be served - or content_range_end == range_end - or content_len < range_end) - if accept_content_len: - ctx.data_len = content_len - return + accept_content_len = ( + # Non-chunked download + not ctx.chunk_size + # Chunked download and requested piece or + # its part is promised to be served + or content_range_end == range_end + or content_len < range_end) + if accept_content_len: + ctx.content_len = content_len + ctx.data_len = min(content_len, req_end or content_len) - (req_start or 0) + return # Content-Range is either not present or invalid. Assuming remote webserver is # trying to send the whole file, resume is not possible, so wiping the local file # and performing entire redownload self.report_unable_to_resume() ctx.resume_len = 0 ctx.open_mode = 'wb' - ctx.data_len = int_or_none(ctx.data.info().get('Content-length', None)) - return + ctx.data_len = ctx.content_len = int_or_none(ctx.data.info().get('Content-length', None)) except (compat_urllib_error.HTTPError, ) as err: if err.code == 416: # Unable to resume (requested range not satisfiable) @@ -191,14 +195,16 @@ # Unexpected HTTP error raise raise RetryDownload(err) - except socket.error as err: - if err.errno != errno.ECONNRESET: - # Connection reset is no problem, just retry + except compat_urllib_error.URLError as err: + if isinstance(err.reason, ssl.CertificateError): raise raise RetryDownload(err) + # In urllib.request.AbstractHTTPHandler, the response is partially read on request. 
+ # Any errors that occur during this will not be wrapped by URLError + except RESPONSE_READ_EXCEPTIONS as err: + raise RetryDownload(err) def download(): - nonlocal throttle_start data_len = ctx.data.info().get('Content-length', None) # Range HTTP header may be ignored/unsupported by a webserver @@ -241,16 +247,8 @@ class HttpFD(FileDownloader): try: # Download and write data_block = ctx.data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) - # socket.timeout is a subclass of socket.error but may not have - # errno set - except socket.timeout as e: - retry(e) - except socket.error as e: - # SSLError on python 2 (inherits socket.error) may have - # no errno set but this error message - if e.errno in (errno.ECONNRESET, errno.ETIMEDOUT) or getattr(e, 'message', None) == 'The read operation timed out': - retry(e) - raise + except RESPONSE_READ_EXCEPTIONS as err: + retry(err) byte_counter += len(data_block) @@ -261,7 +259,7 @@ class HttpFD(FileDownloader): # Open destination file just in time if ctx.stream is None: try: - ctx.stream, ctx.tmpfilename = sanitize_open( + ctx.stream, ctx.tmpfilename = self.sanitize_open( ctx.tmpfilename, ctx.open_mode) assert ctx.stream is not None ctx.filename = self.undo_temp_name(ctx.tmpfilename) @@ -321,16 +319,16 @@ class HttpFD(FileDownloader): if speed and speed < (self.params.get('throttledratelimit') or 0): # The speed must stay below the limit for 3 seconds # This prevents raising error when the speed temporarily goes down - if throttle_start is None: - throttle_start = now - elif now - throttle_start > 3: + if ctx.throttle_start is None: + ctx.throttle_start = now + elif now - ctx.throttle_start > 3: if ctx.stream is not None and ctx.tmpfilename != '-': ctx.stream.close() raise ThrottledDownload() elif speed: - throttle_start = None + ctx.throttle_start = None - if not is_test and ctx.chunk_size and ctx.data_len is not None and byte_counter < ctx.data_len: + if not is_test and ctx.chunk_size and ctx.content_len is not None and byte_counter < ctx.content_len: ctx.resume_len = byte_counter # ctx.block_size = block_size raise NextFragment() diff --git a/hypervideo_dl/downloader/ism.py b/hypervideo_dl/downloader/ism.py index 09516ab..4d5618c 100644 --- a/hypervideo_dl/downloader/ism.py +++ b/hypervideo_dl/downloader/ism.py @@ -263,9 +263,11 @@ class IsmFD(FragmentFD): count = 0 while count <= fragment_retries: try: - success, frag_content = self._download_fragment(ctx, segment['url'], info_dict) + success = self._download_fragment(ctx, segment['url'], info_dict) if not success: return False + frag_content = self._read_fragment(ctx) + if not extra_state['ism_track_written']: tfhd_data = extract_box_data(frag_content, [b'moof', b'traf', b'tfhd']) info_dict['_download_params']['track_id'] = u32.unpack(tfhd_data[4:8])[0] diff --git a/hypervideo_dl/downloader/mhtml.py b/hypervideo_dl/downloader/mhtml.py index f0f4dc6..c8332c0 100644 --- a/hypervideo_dl/downloader/mhtml.py +++ b/hypervideo_dl/downloader/mhtml.py @@ -114,8 +114,8 @@ body > figure > img { fragment_base_url = info_dict.get('fragment_base_url') fragments = info_dict['fragments'][:1] if self.params.get( 'test', False) else info_dict['fragments'] - title = info_dict['title'] - origin = info_dict['webpage_url'] + title = info_dict.get('title', info_dict['format_id']) + origin = info_dict.get('webpage_url', info_dict['url']) ctx = { 'filename': filename, @@ -166,10 +166,15 @@ body > figure > img { if (i + 1) <= ctx['fragment_index']: continue - fragment_url = 
urljoin(fragment_base_url, fragment['path']) - success, frag_content = self._download_fragment(ctx, fragment_url, info_dict) + fragment_url = fragment.get('url') + if not fragment_url: + assert fragment_base_url + fragment_url = urljoin(fragment_base_url, fragment['path']) + + success = self._download_fragment(ctx, fragment_url, info_dict) if not success: continue + frag_content = self._read_fragment(ctx) mime_type = b'image/jpeg' if frag_content.startswith(b'\x89PNG\r\n\x1a\n'): diff --git a/hypervideo_dl/downloader/rtmp.py b/hypervideo_dl/downloader/rtmp.py index 6dca647..90f1acf 100644 --- a/hypervideo_dl/downloader/rtmp.py +++ b/hypervideo_dl/downloader/rtmp.py @@ -12,6 +12,7 @@ from ..utils import ( encodeFilename, encodeArgument, get_exe_version, + Popen, ) @@ -26,7 +27,7 @@ class RtmpFD(FileDownloader): start = time.time() resume_percent = None resume_downloaded_data_len = None - proc = subprocess.Popen(args, stderr=subprocess.PIPE) + proc = Popen(args, stderr=subprocess.PIPE) cursor_in_new_line = True proc_stderr_closed = False try: diff --git a/hypervideo_dl/downloader/websocket.py b/hypervideo_dl/downloader/websocket.py index 0882220..58e2bce 100644 --- a/hypervideo_dl/downloader/websocket.py +++ b/hypervideo_dl/downloader/websocket.py @@ -5,9 +5,12 @@ import threading try: import websockets - has_websockets = True -except ImportError: +except (ImportError, SyntaxError): + # websockets 3.10 on python 3.6 causes SyntaxError + # See https://github.com/hypervideo/hypervideo/issues/2633 has_websockets = False +else: + has_websockets = True from .common import FileDownloader from .external import FFmpegFD diff --git a/hypervideo_dl/downloader/youtube_live_chat.py b/hypervideo_dl/downloader/youtube_live_chat.py index ef4205e..dd21ac8 100644 --- a/hypervideo_dl/downloader/youtube_live_chat.py +++ b/hypervideo_dl/downloader/youtube_live_chat.py @@ -22,6 +22,9 @@ class YoutubeLiveChatFD(FragmentFD): def real_download(self, filename, info_dict): video_id = info_dict['video_id'] self.to_screen('[%s] Downloading live chat' % self.FD_NAME) + if not self.params.get('skip_download') and info_dict['protocol'] == 'youtube_live_chat': + self.report_warning('Live chat download runs until the livestream ends. 
' + 'If you wish to download the video simultaneously, run a separate hypervideo instance') fragment_retries = self.params.get('fragment_retries', 0) test = self.params.get('test', False) @@ -112,9 +115,10 @@ class YoutubeLiveChatFD(FragmentFD): count = 0 while count <= fragment_retries: try: - success, raw_fragment = dl_fragment(url, request_data, headers) + success = dl_fragment(url, request_data, headers) if not success: return False, None, None, None + raw_fragment = self._read_fragment(ctx) try: data = ie.extract_yt_initial_data(video_id, raw_fragment.decode('utf-8', 'replace')) except RegexNotFoundError: @@ -142,9 +146,10 @@ class YoutubeLiveChatFD(FragmentFD): self._prepare_and_start_frag_download(ctx, info_dict) - success, raw_fragment = dl_fragment(info_dict['url']) + success = dl_fragment(info_dict['url']) if not success: return False + raw_fragment = self._read_fragment(ctx) try: data = ie.extract_yt_initial_data(video_id, raw_fragment.decode('utf-8', 'replace')) except RegexNotFoundError: diff --git a/hypervideo_dl/extractor/__init__.py b/hypervideo_dl/extractor/__init__.py index 198c4ae..b354842 100644 --- a/hypervideo_dl/extractor/__init__.py +++ b/hypervideo_dl/extractor/__init__.py @@ -1,14 +1,15 @@ -from __future__ import unicode_literals +import os from ..utils import load_plugins -try: - from .lazy_extractors import * - from .lazy_extractors import _ALL_CLASSES - _LAZY_LOADER = True - _PLUGIN_CLASSES = {} -except ImportError: - _LAZY_LOADER = False +_LAZY_LOADER = False +if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): + try: + from .lazy_extractors import * + from .lazy_extractors import _ALL_CLASSES + _LAZY_LOADER = True + except ImportError: + pass if not _LAZY_LOADER: from .extractors import * @@ -19,8 +20,8 @@ if not _LAZY_LOADER: ] _ALL_CLASSES.append(GenericIE) - _PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals()) - _ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES +_PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals()) +_ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES def gen_extractor_classes(): diff --git a/hypervideo_dl/extractor/abc.py b/hypervideo_dl/extractor/abc.py index 3e20216..6fe195e 100644 --- a/hypervideo_dl/extractor/abc.py +++ b/hypervideo_dl/extractor/abc.py @@ -8,6 +8,7 @@ import time from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + dict_get, ExtractorError, js_to_json, int_or_none, @@ -212,7 +213,7 @@ class ABCIViewIE(InfoExtractor): 'hdnea': token, }) - for sd in ('720', 'sd', 'sd-low'): + for sd in ('1080', '720', 'sd', 'sd-low'): sd_url = try_get( stream, lambda x: x['streams']['hls'][sd], compat_str) if not sd_url: @@ -233,8 +234,6 @@ class ABCIViewIE(InfoExtractor): }] is_live = video_params.get('livestream') == '1' - if is_live: - title = self._live_title(title) return { 'id': video_id, @@ -255,3 +254,65 @@ class ABCIViewIE(InfoExtractor): 'subtitles': subtitles, 'is_live': is_live, } + + +class ABCIViewShowSeriesIE(InfoExtractor): + IE_NAME = 'abc.net.au:iview:showseries' + _VALID_URL = r'https?://iview\.abc\.net\.au/show/(?P[^/]+)(?:/series/\d+)?$' + _GEO_COUNTRIES = ['AU'] + + _TESTS = [{ + 'url': 'https://iview.abc.net.au/show/upper-middle-bogan', + 'info_dict': { + 'id': '124870-1', + 'title': 'Series 1', + 'description': 'md5:93119346c24a7c322d446d8eece430ff', + 'series': 'Upper Middle Bogan', + 'season': 'Series 1', + 'thumbnail': r're:^https?://cdn\.iview\.abc\.net\.au/thumbs/.*\.jpg$' + }, + 'playlist_count': 8, + }, { + 'url': 
+ _TESTS = [{ + 'url': 'https://iview.abc.net.au/show/upper-middle-bogan', + 'info_dict': { + 'id': '124870-1', + 'title': 'Series 1', + 'description': 'md5:93119346c24a7c322d446d8eece430ff', + 'series': 'Upper Middle Bogan', + 'season': 'Series 1', + 'thumbnail': r're:^https?://cdn\.iview\.abc\.net\.au/thumbs/.*\.jpg$' + }, + 'playlist_count': 8, + }, { + 'url': 'https://iview.abc.net.au/show/upper-middle-bogan', + 'info_dict': { + 'id': 'CO1108V001S00', + 'ext': 'mp4', + 'title': 'Series 1 Ep 1 I\'m A Swan', + 'description': 'md5:7b676758c1de11a30b79b4d301e8da93', + 'series': 'Upper Middle Bogan', + 'uploader_id': 'abc1', + 'upload_date': '20210630', + 'timestamp': 1625036400, + }, + 'params': { + 'noplaylist': True, + 'skip_download': 'm3u8', + }, + }] + + def _real_extract(self, url): + show_id = self._match_id(url) + webpage = self._download_webpage(url, show_id) + webpage_data = self._search_regex( + r'window\.__INITIAL_STATE__\s*=\s*[\'"](.+?)[\'"]\s*;', + webpage, 'initial state') + video_data = self._parse_json( + unescapeHTML(webpage_data).encode('utf-8').decode('unicode_escape'), show_id) + video_data = video_data['route']['pageData']['_embedded'] + + highlight = try_get(video_data, lambda x: x['highlightVideo']['shareUrl']) + if not self._yes_playlist(show_id, bool(highlight), video_label='highlight video'): + return self.url_result(highlight, ie=ABCIViewIE.ie_key()) + + series = video_data['selectedSeries'] + return { + '_type': 'playlist', + 'entries': [self.url_result(episode['shareUrl']) + for episode in series['_embedded']['videoEpisodes']], + 'id': series.get('id'), + 'title': dict_get(series, ('title', 'displaySubtitle')), + 'description': series.get('description'), + 'series': dict_get(series, ('showTitle', 'displayTitle')), + 'season': dict_get(series, ('title', 'displaySubtitle')), + 'thumbnail': series.get('thumbnail'), + } diff --git a/hypervideo_dl/extractor/abematv.py b/hypervideo_dl/extractor/abematv.py new file mode 100644 index 0000000..27b7d86 --- /dev/null +++ b/hypervideo_dl/extractor/abematv.py @@ -0,0 +1,476 @@ +import io +import json +import time +import hashlib +import hmac +import re +import struct +from base64 import urlsafe_b64encode +from binascii import unhexlify + +from .common import InfoExtractor +from ..aes import aes_ecb_decrypt +from ..compat import ( + compat_urllib_response, + compat_urllib_parse_urlparse, + compat_urllib_request, +) +from ..utils import ( + ExtractorError, + decode_base, + int_or_none, + random_uuidv4, + request_to_url, + time_seconds, + update_url_query, + traverse_obj, + intlist_to_bytes, + bytes_to_intlist, + urljoin, +) + + +# NOTE: network handler related code is a temporary thing until network stack overhaul PRs are merged (#2861/#2862) + +def add_opener(ydl, handler): + ''' Add a handler for opening URLs, like _download_webpage ''' + # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426 + # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605 + assert isinstance(ydl._opener, compat_urllib_request.OpenerDirector) + ydl._opener.add_handler(handler) + +
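add_opener()/remove_opener() graft extra protocol handlers onto YoutubeDL's urllib opener. The mechanism is stock urllib: any BaseHandler that exposes a <scheme>_open method serves that scheme, which is exactly how AbemaLicenseHandler serves abematv-license:// URLs further down. A self-contained sketch with an invented echo:// scheme:

import io
import urllib.request
import urllib.response

class EchoHandler(urllib.request.BaseHandler):
    # Illustrative only: answers echo://<text> from memory, the same way
    # AbemaLicenseHandler answers abematv-license://<ticket>.
    def echo_open(self, req):
        payload = req.full_url.split('://', 1)[1].encode('utf-8')
        return urllib.response.addinfourl(
            io.BytesIO(payload), headers={'Content-Length': len(payload)},
            url=req.full_url, code=200)

opener = urllib.request.build_opener(EchoHandler())
assert opener.open('echo://hello').read() == b'hello'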
+def remove_opener(ydl, handler): + ''' + Remove handler(s) for opening URLs + @param handler Either the handler object itself or a handler type. + Specifying a handler type will remove all handlers for which isinstance() returns True. + ''' + # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426 + # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605 + opener = ydl._opener + assert isinstance(ydl._opener, compat_urllib_request.OpenerDirector) + if isinstance(handler, (type, tuple)): + find_cp = lambda x: isinstance(x, handler) + else: + find_cp = lambda x: x is handler + + removed = [] + for meth in dir(handler): + if meth in ["redirect_request", "do_open", "proxy_open"]: + # oops, coincidental match + continue + + i = meth.find("_") + protocol = meth[:i] + condition = meth[i + 1:] + + if condition.startswith("error"): + j = condition.find("_") + i + 1 + kind = meth[j + 1:] + try: + kind = int(kind) + except ValueError: + pass + lookup = opener.handle_error.get(protocol, {}) + opener.handle_error[protocol] = lookup + elif condition == "open": + kind = protocol + lookup = opener.handle_open + elif condition == "response": + kind = protocol + lookup = opener.process_response + elif condition == "request": + kind = protocol + lookup = opener.process_request + else: + continue + + handlers = lookup.setdefault(kind, []) + removed.extend(x for x in handlers if find_cp(x)) + if handlers: + handlers[:] = [x for x in handlers if not find_cp(x)] + + if removed: + for x in opener.handlers: + if find_cp(x): + x.add_parent(None) + opener.handlers[:] = [x for x in opener.handlers if not find_cp(x)] + + +class AbemaLicenseHandler(compat_urllib_request.BaseHandler): + handler_order = 499 + STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz' + HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E' + + def __init__(self, ie: 'AbemaTVIE'): + # the protocol that this should really handle is 'abematv-license://' + # abematv_license_open is just a placeholder for development purposes + # ref.
https://github.com/python/cpython/blob/f4c03484da59049eb62a9bf7777b963e2267d187/Lib/urllib/request.py#L510 + setattr(self, 'abematv-license_open', getattr(self, 'abematv_license_open')) + self.ie = ie + + def _get_videokey_from_ticket(self, ticket): + to_show = self.ie._downloader.params.get('verbose', False) + media_token = self.ie._get_media_token(to_show=to_show) + + license_response = self.ie._download_json( + 'https://license.abema.io/abematv-hls', None, note='Requesting playback license' if to_show else False, + query={'t': media_token}, + data=json.dumps({ + 'kv': 'a', + 'lt': ticket + }).encode('utf-8'), + headers={ + 'Content-Type': 'application/json', + }) + + res = decode_base(license_response['k'], self.STRTABLE) + encvideokey = bytes_to_intlist(struct.pack('>QQ', res >> 64, res & 0xffffffffffffffff)) + + h = hmac.new( + unhexlify(self.HKEY), + (license_response['cid'] + self.ie._DEVICE_ID).encode('utf-8'), + digestmod=hashlib.sha256) + enckey = bytes_to_intlist(h.digest()) + + return intlist_to_bytes(aes_ecb_decrypt(encvideokey, enckey)) + + def abematv_license_open(self, url): + url = request_to_url(url) + ticket = compat_urllib_parse_urlparse(url).netloc + response_data = self._get_videokey_from_ticket(ticket) + return compat_urllib_response.addinfourl(io.BytesIO(response_data), headers={ + 'Content-Length': len(response_data), + }, url=url, code=200) + + +class AbemaTVBaseIE(InfoExtractor): + def _extract_breadcrumb_list(self, webpage, video_id): + for jld in re.finditer( + r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>', + webpage): + jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False) + if jsonld: + if jsonld.get('@type') != 'BreadcrumbList': + continue + trav = traverse_obj(jsonld, ('itemListElement', ..., 'name')) + if trav: + return trav + return [] + + +class AbemaTVIE(AbemaTVBaseIE): + _VALID_URL = r'https?://abema\.tv/(?P<type>now-on-air|video/episode|channels/.+?/slots)/(?P<id>[^?/]+)' + _NETRC_MACHINE = 'abematv' + _TESTS = [{ + 'url': 'https://abema.tv/video/episode/194-25_s2_p1', + 'info_dict': { + 'id': '194-25_s2_p1', + 'title': '第1話 「チーズケーキ」 「モーニング再び」', + 'series': '異世界食堂2', + 'series_number': 2, + 'episode': '第1話 「チーズケーキ」 「モーニング再び」', + 'episode_number': 1, + }, + 'skip': 'expired', + }, { + 'url': 'https://abema.tv/channels/anime-live2/slots/E8tvAnMJ7a9a5d', + 'info_dict': { + 'id': 'E8tvAnMJ7a9a5d', + 'title': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】', + 'series': 'ゆるキャン△ SEASON2', + 'episode': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】', + 'series_number': 2, + 'episode_number': 1, + 'description': 'md5:9c5a3172ae763278f9303922f0ea5b17', + }, + 'skip': 'expired', + }, { + 'url': 'https://abema.tv/video/episode/87-877_s1282_p31047', + 'info_dict': { + 'id': 'E8tvAnMJ7a9a5d', + 'title': '第5話『光射す』', + 'description': 'md5:56d4fc1b4f7769ded5f923c55bb4695d', + 'thumbnail': r're:https://hayabusa\.io/.+', + 'series': '相棒', + 'episode': '第5話『光射す』', + }, + 'skip': 'expired', + }, { + 'url': 'https://abema.tv/now-on-air/abema-anime', + 'info_dict': { + 'id': 'abema-anime', + # this varies + # 'title': '女子高生の無駄づかい 全話一挙【無料ビデオ72時間】', + 'description': 'md5:55f2e61f46a17e9230802d7bcc913d5f', + 'is_live': True, + }, + 'skip': 'Not supported until hypervideo implements native live downloader OR AbemaTV can start a local HTTP server', + }] + _USERTOKEN = None + _DEVICE_ID = None + _TIMETABLE = None + _MEDIATOKEN = None + + _SECRETKEY = b'v+Gjs=25Aw5erR!J8ZuvRrCx*rGswhB&qdHd_SYerEWdU&a?3DzN9BRbp5KwY4hEmcj5#fykMjJ=AuWz5GSMY-d@H7DMEh3M@9n2G552Us$$k9cD=3TxwWe86!x#Zyhe' +
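For readers following _get_videokey_from_ticket above: the license response's 'k' is a number written in the 58-character STRTABLE alphabet; it packs into 16 bytes, and the key that unwraps it is an HMAC-SHA256 over cid plus device id. A sketch with dummy inputs (the HKEY and ticket values here are stand-ins, not real secrets):

import hashlib
import hmac
import struct

STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'

def decode_base(value, digits=STRTABLE):
    # stdlib equivalent of ..utils.decode_base: interpret value as a
    # number in base len(digits), using digits as the alphabet
    num = 0
    for char in value:
        num = num * len(digits) + digits.index(char)
    return num

res = decode_base('9KQ3x')  # license_response['k'] in the real flow
encvideokey = struct.pack('>QQ', res >> 64, res & 0xffffffffffffffff)
enckey = hmac.new(bytes.fromhex('aa' * 32),  # dummy stand-in for HKEY
                  b'example-cid' + b'example-device-id', hashlib.sha256).digest()
# The real handler finishes by AES-ECB-decrypting encvideokey with enckey
# (hypervideo_dl.aes.aes_ecb_decrypt) to get the 16-byte HLS key.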
+ def _generate_aks(self, deviceid): + deviceid = deviceid.encode('utf-8') + # add 1 hour and then drop minute and secs + ts_1hour = int((time_seconds(hours=9) // 3600 + 1) * 3600) + time_struct = time.gmtime(ts_1hour) + ts_1hour_str = str(ts_1hour).encode('utf-8') + + tmp = None + + def mix_once(nonce): + nonlocal tmp + h = hmac.new(self._SECRETKEY, digestmod=hashlib.sha256) + h.update(nonce) + tmp = h.digest() + + def mix_tmp(count): + nonlocal tmp + for i in range(count): + mix_once(tmp) + + def mix_twist(nonce): + nonlocal tmp + mix_once(urlsafe_b64encode(tmp).rstrip(b'=') + nonce) + + mix_once(self._SECRETKEY) + mix_tmp(time_struct.tm_mon) + mix_twist(deviceid) + mix_tmp(time_struct.tm_mday % 5) + mix_twist(ts_1hour_str) + mix_tmp(time_struct.tm_hour % 5) + + return urlsafe_b64encode(tmp).rstrip(b'=').decode('utf-8') + + def _get_device_token(self): + if self._USERTOKEN: + return self._USERTOKEN + + self._DEVICE_ID = random_uuidv4() + aks = self._generate_aks(self._DEVICE_ID) + user_data = self._download_json( + 'https://api.abema.io/v1/users', None, note='Authorizing', + data=json.dumps({ + 'deviceId': self._DEVICE_ID, + 'applicationKeySecret': aks, + }).encode('utf-8'), + headers={ + 'Content-Type': 'application/json', + }) + self._USERTOKEN = user_data['token'] + + # don't allow adding it twice or more, though it's guarded + remove_opener(self._downloader, AbemaLicenseHandler) + add_opener(self._downloader, AbemaLicenseHandler(self)) + + return self._USERTOKEN + + def _get_media_token(self, invalidate=False, to_show=True): + if not invalidate and self._MEDIATOKEN: + return self._MEDIATOKEN + + self._MEDIATOKEN = self._download_json( + 'https://api.abema.io/v1/media/token', None, note='Fetching media token' if to_show else False, + query={ + 'osName': 'android', + 'osVersion': '6.0.1', + 'osLang': 'ja_JP', + 'osTimezone': 'Asia/Tokyo', + 'appId': 'tv.abema', + 'appVersion': '3.27.1' + }, headers={ + 'Authorization': 'bearer ' + self._get_device_token() + })['token'] + + return self._MEDIATOKEN + + def _perform_login(self, username, password): + if '@' in username: # don't strictly check whether it's an email address or not + ep, method = 'user/email', 'email' + else: + ep, method = 'oneTimePassword', 'userId' + + login_response = self._download_json( + f'https://api.abema.io/v1/auth/{ep}', None, note='Logging in', + data=json.dumps({ + method: username, + 'password': password + }).encode('utf-8'), headers={ + 'Authorization': 'bearer ' + self._get_device_token(), + 'Origin': 'https://abema.tv', + 'Referer': 'https://abema.tv/', + 'Content-Type': 'application/json', + }) + + self._USERTOKEN = login_response['token'] + self._get_media_token(True) +
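The key schedule in _generate_aks above is easier to follow outside the extractor plumbing. A standalone re-implementation (illustrative only; behaviour mirrors the code as written - a chained HMAC-SHA256 keyed on the app secret, salted with the device id and the upcoming JST hour):

import hashlib
import hmac
import time
from base64 import urlsafe_b64encode

def generate_aks(secret, device_id, now=None):
    ts_1hour = int(((now or time.time()) + 9 * 3600) // 3600 + 1) * 3600
    tm = time.gmtime(ts_1hour)
    tmp = b''

    def b64(data):
        return urlsafe_b64encode(data).rstrip(b'=')

    def mix(nonce):
        nonlocal tmp
        tmp = hmac.new(secret, nonce, hashlib.sha256).digest()

    mix(secret)                             # mix_once(_SECRETKEY)
    for _ in range(tm.tm_mon):              # mix_tmp(month)
        mix(tmp)
    mix(b64(tmp) + device_id.encode())      # mix_twist(deviceid)
    for _ in range(tm.tm_mday % 5):         # mix_tmp(day % 5)
        mix(tmp)
    mix(b64(tmp) + str(ts_1hour).encode())  # mix_twist(timestamp)
    for _ in range(tm.tm_hour % 5):         # mix_tmp(hour % 5)
        mix(tmp)
    return b64(tmp).decode()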
+ def _real_extract(self, url): + # starting download using infojson from this extractor is undefined behavior, + # and will never be fixed in the future; you must trigger downloads by directly specifying the URL. + # (unless there's a way to hook before downloading by extractor) + video_id, video_type = self._match_valid_url(url).group('id', 'type') + headers = { + 'Authorization': 'Bearer ' + self._get_device_token(), + } + video_type = video_type.split('/')[-1] + + webpage = self._download_webpage(url, video_id) + canonical_url = self._search_regex( + r'<link\s+rel="canonical"\s*href="(.+?)"', webpage, 'canonical URL', default=url) + info = self._search_json_ld(webpage, video_id, default={}) + + title = self._search_regex( + r'<span\s*class=".+?EpisodeTitleBlock__title"\s*>(.+?)</span>', webpage, 'title', default=None) + if not title: + jsonld = None + for jld in re.finditer( + r'(?is)(?:)?<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>', + webpage): + jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False) + if jsonld: + break + if jsonld: + title = jsonld.get('caption') + if not title and video_type == 'now-on-air': + if not self._TIMETABLE: + # cache the timetable because it goes to 5MiB in size (!!) + self._TIMETABLE = self._download_json( + 'https://api.abema.io/v1/timetable/dataSet?debug=false', video_id, + headers=headers) + now = time_seconds(hours=9) + for slot in self._TIMETABLE.get('slots', []): + if slot.get('channelId') != video_id: + continue + if slot['startAt'] <= now and now < slot['endAt']: + title = slot['title'] + break + + # read breadcrumb on top of page + breadcrumb = self._extract_breadcrumb_list(webpage, video_id) + if breadcrumb: + # breadcrumb list translates to: (example is 1st test for this IE) + # Home > Anime (genre) > Isekai Shokudo 2 (series name) > Episode 1 "Cheese cakes" "Morning again" (episode title) + # hence this works + info['series'] = breadcrumb[-2] + info['episode'] = breadcrumb[-1] + if not title: + title = info['episode'] + + description = self._html_search_regex( + (r'<p\s+class="com-video-EpisodeDetailsBlock__content"><span\s+class=".+?">(.+?)</span></p><div', + r'<span\s+class=".+?SlotSummary.+?">(.+?)</span></div><div', + ), webpage, 'description', default=None, fatal=False) + + +class AbemaTVTitleIE(AbemaTVBaseIE): + _VALID_URL = r'https?://abema\.tv/video/title/(?P<id>[^?/]+)' + + _TESTS = [{ + 'url': 'https://abema.tv/video/title/90-1597', + 'info_dict': { + 'id': '90-1597', + 'title': 'シャッフルアイランド', + }, + 'playlist_mincount': 2, + }, { + 'url': 'https://abema.tv/video/title/193-132', + 'info_dict': { + 'id': '193-132', + 'title': '真心が届く~僕とスターのオフィス・ラブ!?~', + }, + 'playlist_mincount': 16, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + playlist_title, breadcrumb = None, self._extract_breadcrumb_list(webpage, video_id) + if breadcrumb: + playlist_title = breadcrumb[-1] + + playlist = [ + self.url_result(urljoin('https://abema.tv/', mobj.group(1))) + for mobj in re.finditer(r'<li\s*class=".+?EpisodeList.+?"><a\s*href="(/[^"]+?)"', webpage)] + + return self.playlist_result(playlist, playlist_title=playlist_title, playlist_id=video_id) diff --git a/hypervideo_dl/extractor/adn.py b/hypervideo_dl/extractor/adn.py .replace('<i>', '{\\i1}').replace('</i>', '{\\i0}')) @@ -133,10 +126,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' }]) return subtitles - def _real_initialize(self): - username, password = self._get_login_info() - if not username: - return + def _perform_login(self, username, password): try: access_token = (self._download_json( self._API_BASE_URL + 'authentication/login', None, diff --git a/hypervideo_dl/extractor/adobeconnect.py b/hypervideo_dl/extractor/adobeconnect.py index 728549e..e2e6f93 100644 --- a/hypervideo_dl/extractor/adobeconnect.py +++ b/hypervideo_dl/extractor/adobeconnect.py @@ -14,7 +14,7 @@ class AdobeConnectIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title') + title = self._html_extract_title(webpage) qs = compat_parse_qs(self._search_regex(r"swfUrl\s*=\s*'([^']+)'", webpage, 'swf url').split('?')[1]) is_live = qs.get('isLive', ['false'])[0] == 'true' formats = [] @@ -31,7 +31,7 @@ class AdobeConnectIE(InfoExtractor): return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'formats': formats, 'is_live': is_live, } diff --git a/hypervideo_dl/extractor/adobepass.py b/hypervideo_dl/extractor/adobepass.py index 9378c33..5d98301 100644 --- a/hypervideo_dl/extractor/adobepass.py +++ b/hypervideo_dl/extractor/adobepass.py @@ -39,8 +39,8 @@ MSO_INFO = { }, 'RCN': { 'name': 'RCN', - 'username_field': 'UserName', - 'password_field': 'UserPassword', + 'username_field': 'username', + 'password_field': 'password', }, 'Rogers': { 'name': 'Rogers', @@ -1345,6 +1345,11 @@ MSO_INFO = { 'username_field': 'username', 'password_field': 'password', }, + 'Suddenlink': { + 'name': 'Suddenlink', + 'username_field': 'username', + 'password_field': 'password', + }, } @@ -1635,6 +1640,58 @@ class AdobePassIE(InfoExtractor): urlh.geturl(), video_id, 'Sending final bookend', query=hidden_data) + post_form(mvpd_confirm_page_res, 'Confirming Login') + elif mso_id == 'Suddenlink': + # Suddenlink is similar to SlingTV in using a tab history count and a meta refresh, + # but they also do a dynamic redirect using JavaScript that has to be followed as well + first_bookend_page, urlh = post_form( + provider_redirect_page_res, 'Pressing Continue...') + + hidden_data = self._hidden_inputs(first_bookend_page) + hidden_data['history_val'] = 1 + + provider_login_redirect_page_res = self._download_webpage_handle( + urlh.geturl(), video_id, 'Sending First Bookend', + query=hidden_data) + + provider_login_redirect_page, urlh = provider_login_redirect_page_res +
+ # Some website partners seem to not have the extra ajaxurl redirect step, so we check if we already + # have the login prompt or not + if 'id="password" type="password" name="password"' in provider_login_redirect_page: + provider_login_page_res = provider_login_redirect_page_res + else: + provider_tryauth_url = self._html_search_regex( + r'url:\s*[\'"]([^\'"]+)', provider_login_redirect_page, 'ajaxurl') + provider_tryauth_page = self._download_webpage( + provider_tryauth_url, video_id, 'Submitting TryAuth', + query=hidden_data) + + provider_login_page_res = self._download_webpage_handle( + f'https://authorize.suddenlink.net/saml/module.php/authSynacor/login.php?AuthState={provider_tryauth_page}', + video_id, 'Getting Login Page', + query=hidden_data) + + provider_association_redirect, urlh = post_form( + provider_login_page_res, 'Logging in', { + mso_info['username_field']: username, + mso_info['password_field']: password + }) + + provider_refresh_redirect_url = extract_redirect_url( + provider_association_redirect, url=urlh.geturl()) + + last_bookend_page, urlh = self._download_webpage_handle( + provider_refresh_redirect_url, video_id, + 'Downloading Auth Association Redirect Page') + + hidden_data = self._hidden_inputs(last_bookend_page) + hidden_data['history_val'] = 3 + + mvpd_confirm_page_res = self._download_webpage_handle( + urlh.geturl(), video_id, 'Sending Final Bookend', + query=hidden_data) + post_form(mvpd_confirm_page_res, 'Confirming Login') else: # Some providers (e.g. DIRECTV NOW) have another meta refresh diff --git a/hypervideo_dl/extractor/adobetv.py b/hypervideo_dl/extractor/adobetv.py index 12b8192..3cfa1ff 100644 --- a/hypervideo_dl/extractor/adobetv.py +++ b/hypervideo_dl/extractor/adobetv.py @@ -9,6 +9,7 @@ from ..utils import ( float_or_none, int_or_none, ISO639Utils, + join_nonempty, OnDemandPagedList, parse_duration, str_or_none, @@ -263,7 +264,7 @@ class AdobeTVVideoIE(AdobeTVBaseIE): continue formats.append({ 'filesize': int_or_none(source.get('kilobytes') or None, invscale=1000), - 'format_id': '-'.join(filter(None, [source.get('format'), source.get('label')])), + 'format_id': join_nonempty(source.get('format'), source.get('label')), 'height': int_or_none(source.get('height') or None), 'tbr': int_or_none(source.get('bitrate') or None), 'width': int_or_none(source.get('width') or None), diff --git a/hypervideo_dl/extractor/afreecatv.py b/hypervideo_dl/extractor/afreecatv.py index 063872b..77f0e3c 100644 --- a/hypervideo_dl/extractor/afreecatv.py +++ b/hypervideo_dl/extractor/afreecatv.py @@ -10,7 +10,11 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, + qualities, + traverse_obj, unified_strdate, + unified_timestamp, + update_url_query, url_or_none, urlencode_postdata, xpath_text, @@ -28,7 +32,7 @@ class AfreecaTVIE(InfoExtractor): /app/(?:index|read_ucc_bbs)\.cgi| /player/[Pp]layer\.(?:swf|html) )\?.*?\bnTitleNo=| - vod\.afreecatv\.com/PLAYER/STATION/ + vod\.afreecatv\.com/(PLAYER/STATION|player)/ ) (?P<id>\d+) ''' @@ -166,6 +170,9 @@ class AfreecaTVIE(InfoExtractor): }, { 'url': 'http://vod.afreecatv.com/PLAYER/STATION/15055030', 'only_matching': True, + }, { + 'url': 'http://vod.afreecatv.com/player/15055030', + 'only_matching': True, }] @staticmethod @@ -177,14 +184,7 @@ class AfreecaTVIE(InfoExtractor): video_key['part'] = int(m.group('part')) return video_key - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): login_form = { 'szWork': 'login', 'szType': 'json', @@ -380,3 +380,105 @@ class AfreecaTVIE(InfoExtractor): }) return info + +
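The _perform_login change above repeats across this patch: extractors drop their _real_initialize/_login boilerplate and implement only _perform_login(username, password), which the framework invokes once, and only when credentials were actually supplied. A minimal sketch of the pattern with simplified stand-ins (not the real InfoExtractor API):

class BaseIE:
    def _get_login_info(self):
        # e.g. from .netrc or --username/--password; None means anonymous
        return None, None

    def initialize(self):
        username, password = self._get_login_info()
        if username is not None and hasattr(self, '_perform_login'):
            self._perform_login(username, password)

class ExampleIE(BaseIE):
    def _perform_login(self, username, password):
        # only the site-specific login lives here; no credential plumbing
        self.token = f'{username}:{password}'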
+class AfreecaTVLiveIE(AfreecaTVIE): + + IE_NAME = 'afreecatv:live' + _VALID_URL = r'https?://play\.afreeca(?:tv)?\.com/(?P<id>[^/]+)(?:/(?P<bno>\d+))?' + _TESTS = [{ + 'url': 'https://play.afreecatv.com/pyh3646/237852185', + 'info_dict': { + 'id': '237852185', + 'ext': 'mp4', + 'title': '【 우루과이 오늘은 무슨일이? 】', + 'uploader': '박진우[JINU]', + 'uploader_id': 'pyh3646', + 'timestamp': 1640661495, + 'is_live': True, + }, + 'skip': 'Livestream has ended', + }, { + 'url': 'http://play.afreeca.com/pyh3646/237852185', + 'only_matching': True, + }, { + 'url': 'http://play.afreeca.com/pyh3646', + 'only_matching': True, + }] + + _LIVE_API_URL = 'https://live.afreecatv.com/afreeca/player_live_api.php' + + _QUALITIES = ('sd', 'hd', 'hd2k', 'original') + + def _real_extract(self, url): + broadcaster_id, broadcast_no = self._match_valid_url(url).group('id', 'bno') + password = self.get_param('videopassword') + + info = self._download_json(self._LIVE_API_URL, broadcaster_id, fatal=False, + data=urlencode_postdata({'bid': broadcaster_id})) or {} + channel_info = info.get('CHANNEL') or {} + broadcaster_id = channel_info.get('BJID') or broadcaster_id + broadcast_no = channel_info.get('BNO') or broadcast_no + password_protected = channel_info.get('BPWD') + if not broadcast_no: + raise ExtractorError(f'Unable to extract broadcast number ({broadcaster_id} may not be live)', expected=True) + if password_protected == 'Y' and password is None: + raise ExtractorError( + 'This livestream is protected by a password, use the --video-password option', + expected=True) + + formats = [] + quality_key = qualities(self._QUALITIES) + for quality_str in self._QUALITIES: + params = { + 'bno': broadcast_no, + 'stream_type': 'common', + 'type': 'aid', + 'quality': quality_str, + } + if password is not None: + params['pwd'] = password + aid_response = self._download_json( + self._LIVE_API_URL, broadcast_no, fatal=False, + data=urlencode_postdata(params), + note=f'Downloading access token for {quality_str} stream', + errnote=f'Unable to download access token for {quality_str} stream') + aid = traverse_obj(aid_response, ('CHANNEL', 'AID')) + if not aid: + continue + + stream_base_url = channel_info.get('RMD') or 'https://livestream-manager.afreecatv.com' + stream_info = self._download_json( + f'{stream_base_url}/broad_stream_assign.html', broadcast_no, fatal=False, + query={ + 'return_type': channel_info.get('CDN', 'gcp_cdn'), + 'broad_key': f'{broadcast_no}-common-{quality_str}-hls', + }, + note=f'Downloading metadata for {quality_str} stream', + errnote=f'Unable to download metadata for {quality_str} stream') or {} + + if stream_info.get('view_url'): + formats.append({ + 'format_id': quality_str, + 'url': update_url_query(stream_info['view_url'], {'aid': aid}), + 'ext': 'mp4', + 'protocol': 'm3u8', + 'quality': quality_key(quality_str), + }) + + self._sort_formats(formats) + + station_info = self._download_json( + 'https://st.afreecatv.com/api/get_station_status.php', broadcast_no, + query={'szBjId': broadcaster_id}, fatal=False, + note='Downloading channel metadata', errnote='Unable to download channel metadata') or {} + + return { + 'id': broadcast_no, + 'title': channel_info.get('TITLE') or station_info.get('station_title'), + 'uploader': channel_info.get('BJNICK') or station_info.get('station_name'), + 'uploader_id': broadcaster_id, + 'timestamp': unified_timestamp(station_info.get('broad_start')), + 'formats': formats, + 'is_live': True, + }
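For reference, the live extractor above performs a two-step handshake per quality: POST for an AID access token, then resolve the playable URL from the stream-assign endpoint. A hedged stdlib sketch of the first step (error handling omitted):

import json
import urllib.parse
import urllib.request

LIVE_API = 'https://live.afreecatv.com/afreeca/player_live_api.php'

def fetch_aid(broadcast_no, quality, password=None):
    # Same parameters the extractor sends; returns CHANNEL.AID or None.
    params = {'bno': broadcast_no, 'stream_type': 'common',
              'type': 'aid', 'quality': quality}
    if password is not None:
        params['pwd'] = password
    req = urllib.request.Request(LIVE_API, urllib.parse.urlencode(params).encode())
    with urllib.request.urlopen(req) as resp:
        return json.load(resp).get('CHANNEL', {}).get('AID')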
diff --git a/hypervideo_dl/extractor/aliexpress.py b/hypervideo_dl/extractor/aliexpress.py index 6f241e6..9722fe9 100644 --- a/hypervideo_dl/extractor/aliexpress.py +++ b/hypervideo_dl/extractor/aliexpress.py @@ -18,7 +18,7 @@ class AliExpressLiveIE(InfoExtractor): 'id': '2800002704436634', 'ext': 'mp4', 'title': 'CASIMA7.22', - 'thumbnail': r're:http://.*\.jpg', + 'thumbnail': r're:https?://.*\.jpg', 'uploader': 'CASIMA Official Store', 'timestamp': 1500717600, 'upload_date': '20170722', diff --git a/hypervideo_dl/extractor/aljazeera.py b/hypervideo_dl/extractor/aljazeera.py index e829b45..7bcdb7a 100644 --- a/hypervideo_dl/extractor/aljazeera.py +++ b/hypervideo_dl/extractor/aljazeera.py @@ -1,55 +1,86 @@ +# coding: utf-8 from __future__ import unicode_literals import json from .common import InfoExtractor +from ..utils import ( + try_get, +) class AlJazeeraIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?P<type>program/[^/]+|(?:feature|video)s)/\d{4}/\d{1,2}/\d{1,2}/(?P<id>[^/?&#]+)' + _VALID_URL = r'https?://(?P<base>\w+\.aljazeera\.\w+)/(?P<type>programs?/[^/]+|(?:feature|video|new)s)?/\d{4}/\d{1,2}/\d{1,2}/(?P<id>[^/?&#]+)' _TESTS = [{ - 'url': 'https://www.aljazeera.com/program/episode/2014/9/19/deliverance', + 'url': 'https://balkans.aljazeera.net/videos/2021/11/6/pojedini-domovi-u-sarajevu-jos-pod-vodom-mjestanima-se-dostavlja-hrana', 'info_dict': { - 'id': '3792260579001', + 'id': '6280641530001', 'ext': 'mp4', - 'title': 'The Slum - Episode 1: Deliverance', - 'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.', - 'uploader_id': '665003303001', - 'timestamp': 1411116829, - 'upload_date': '20140919', - }, - 'add_ie': ['BrightcoveNew'], - 'skip': 'Not accessible from Travis CI server', - }, { - 'url': 'https://www.aljazeera.com/videos/2017/5/11/sierra-leone-709-carat-diamond-to-be-auctioned-off', - 'only_matching': True, + 'title': 'Pojedini domovi u Sarajevu još pod vodom, mještanima se dostavlja hrana', + 'timestamp': 1636219149, + 'description': 'U sarajevskim naseljima Rajlovac i Reljevo stambeni objekti, ali i industrijska postrojenja i dalje su pod vodom.', + 'upload_date': '20211106', + } }, { - 'url': 'https://www.aljazeera.com/features/2017/8/21/transforming-pakistans-buses-into-art', - 'only_matching': True, + 'url': 'https://balkans.aljazeera.net/videos/2021/11/6/djokovic-usao-u-finale-mastersa-u-parizu', + 'info_dict': { + 'id': '6280654936001', + 'ext': 'mp4', + 'title': 'Đoković ušao u finale Mastersa u Parizu', + 'timestamp': 1636221686, + 'description': 'Novak Đoković je u polufinalu Mastersa u Parizu nakon preokreta pobijedio Poljaka Huberta Hurkacza.', + 'upload_date': '20211106', + }, }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' + BRIGHTCOVE_URL_RE = r'https?://players.brightcove.net/(?P<account>\d+)/(?P<player_id>[a-zA-Z0-9]+)_(?P<embed>[^/]+)/index.html\?videoId=(?P<id>\d+)' def _real_extract(self, url): - post_type, name = self._match_valid_url(url).groups() + base, post_type, id = self._match_valid_url(url).groups() + wp = { + 'balkans.aljazeera.net': 'ajb', + 'chinese.aljazeera.net': 'chinese', + 'mubasher.aljazeera.net': 'ajm', + }.get(base) or 'aje' post_type = { 'features': 'post', 'program': 'episode', + 'programs': 'episode', 'videos': 'video', + 'news': 'news', }[post_type.split('/')[0]] video = self._download_json( - 'https://www.aljazeera.com/graphql', name, query={ + f'https://{base}/graphql', id, query={ + 'wp-site': wp, 'operationName': 'ArchipelagoSingleArticleQuery', 'variables': json.dumps({ - 'name': name, + 'name': id, 'postType': 
post_type, + }), + }, headers={ - 'wp-site': 'aje', - })['data']['article']['video'] - video_id = video['id'] - account_id = video.get('accountId') or '665003303001' - player_id = video.get('playerId') or 'BkeSH5BDb' - return self.url_result( - self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), - 'BrightcoveNew', video_id) + 'wp-site': wp, + }) + video = try_get(video, lambda x: x['data']['article']['video']) or {} + video_id = video.get('id') + account = video.get('accountId') or '911432371001' + player_id = video.get('playerId') or 'csvTfAlKW' + embed = 'default' + + if video_id is None: + webpage = self._download_webpage(url, id) + + account, player_id, embed, video_id = self._search_regex(self.BRIGHTCOVE_URL_RE, webpage, 'video id', + group=(1, 2, 3, 4), default=(None, None, None, None)) + + if video_id is None: + return { + '_type': 'url_transparent', + 'url': url, + 'ie_key': 'Generic' + } + + return { + '_type': 'url_transparent', + 'url': f'https://players.brightcove.net/{account}/{player_id}_{embed}/index.html?videoId={video_id}', + 'ie_key': 'BrightcoveNew' + } diff --git a/hypervideo_dl/extractor/allocine.py b/hypervideo_dl/extractor/allocine.py index cd533ac..403a277 100644 --- a/hypervideo_dl/extractor/allocine.py +++ b/hypervideo_dl/extractor/allocine.py @@ -7,6 +7,7 @@ from ..utils import ( int_or_none, qualities, remove_end, + strip_or_none, try_get, unified_timestamp, url_basename, @@ -102,10 +103,7 @@ class AllocineIE(InfoExtractor): video_id = display_id media_data = self._download_json( 'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, display_id) - title = remove_end( - self._html_search_regex( - r'(?s)<title>(.+?)</title>', webpage, 'title').strip(), - ' - AlloCiné') + title = remove_end(strip_or_none(self._html_extract_title(webpage)), ' - AlloCiné') for key, value in media_data['video'].items(): if not key.endswith('Path'): continue diff --git a/hypervideo_dl/extractor/alsace20tv.py b/hypervideo_dl/extractor/alsace20tv.py new file mode 100644 index 0000000..4aae6fe --- /dev/null +++ b/hypervideo_dl/extractor/alsace20tv.py @@ -0,0 +1,87 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + dict_get, + get_element_by_class, + int_or_none, + unified_strdate, + url_or_none, +) + + +class Alsace20TVBaseIE(InfoExtractor): + def _extract_video(self, video_id, url=None): + info = self._download_json( + 'https://www.alsace20.tv/visionneuse/visio_v9_js.php?key=%s&habillage=0&mode=html' % (video_id, ), + video_id) or {} + title = info.get('titre') + + formats = [] + for res, fmt_url in (info.get('files') or {}).items(): + formats.extend( + self._extract_smil_formats(fmt_url, video_id, fatal=False) + if '/smil:_' in fmt_url + else self._extract_mpd_formats(fmt_url, video_id, mpd_id=res, fatal=False)) + self._sort_formats(formats) + + webpage = (url and self._download_webpage(url, video_id, fatal=False)) or '' + thumbnail = url_or_none(dict_get(info, ('image', 'preview', )) or self._og_search_thumbnail(webpage)) + upload_date = self._search_regex(r'/(\d{6})_', thumbnail, 'upload_date', default=None) + upload_date = unified_strdate('20%s-%s-%s' % (upload_date[:2], upload_date[2:4], upload_date[4:])) if upload_date else None + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': clean_html(get_element_by_class('wysiwyg', webpage)), + 'upload_date': upload_date, + 'thumbnail': thumbnail, + 'duration': int_or_none(self._og_search_property('video:duration', webpage) if webpage else None), + 'view_count': int_or_none(info.get('nb_vues')), + } + +
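One detail worth flagging in _extract_video above: when the page gives no explicit date, the upload date is recovered from a YYMMDD stamp embedded in the thumbnail filename. A standalone sketch of that heuristic (helper name hypothetical):

import re

def upload_date_from_thumbnail(thumbnail):
    # '.../220203_JT.jpg' -> '20220203', as in _extract_video above
    m = re.search(r'/(\d{6})_', thumbnail or '')
    if not m:
        return None
    d = m.group(1)
    return '20%s%s%s' % (d[:2], d[2:4], d[4:])

assert upload_date_from_thumbnail('https://www.alsace20.tv/img/220203_JT.jpg') == '20220203'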
+class Alsace20TVIE(Alsace20TVBaseIE): + _VALID_URL = r'https?://(?:www\.)?alsace20\.tv/(?:[\w-]+/)+[\w-]+-(?P<id>[\w]+)' + _TESTS = [{ + 'url': 'https://www.alsace20.tv/VOD/Actu/JT/Votre-JT-jeudi-3-fevrier-lyNHCXpYJh.html', + 'info_dict': { + 'id': 'lyNHCXpYJh', + 'ext': 'mp4', + 'description': 'md5:fc0bc4a0692d3d2dba4524053de4c7b7', + 'title': 'Votre JT du jeudi 3 février', + 'upload_date': '20220203', + 'thumbnail': r're:https?://.+\.jpg', + 'duration': 1073, + 'view_count': int, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_video(video_id, url) + + +class Alsace20TVEmbedIE(Alsace20TVBaseIE): + _VALID_URL = r'https?://(?:www\.)?alsace20\.tv/emb/(?P<id>[\w]+)' + _TESTS = [{ + 'url': 'https://www.alsace20.tv/emb/lyNHCXpYJh', + # 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb', + 'info_dict': { + 'id': 'lyNHCXpYJh', + 'ext': 'mp4', + 'title': 'Votre JT du jeudi 3 février', + 'upload_date': '20220203', + 'thumbnail': r're:https?://.+\.jpg', + 'view_count': int, + }, + 'params': { + 'format': 'bestvideo', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_video(video_id) diff --git a/hypervideo_dl/extractor/alura.py b/hypervideo_dl/extractor/alura.py index f5325de..d2e2df2 100644 --- a/hypervideo_dl/extractor/alura.py +++ b/hypervideo_dl/extractor/alura.py @@ -74,14 +74,7 @@ class AluraIE(InfoExtractor): "formats": formats } - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - pass + def _perform_login(self, username, password): login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login popup') diff --git a/hypervideo_dl/extractor/amazon.py b/hypervideo_dl/extractor/amazon.py new file mode 100644 index 0000000..07b1b18 --- /dev/null +++ b/hypervideo_dl/extractor/amazon.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from .common import InfoExtractor +from ..utils import int_or_none + + +class AmazonStoreIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/(?:[^/]+/)?(?:dp|gp/product)/(?P<id>[^/&#$?]+)' + + _TESTS = [{ + 'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/', + 'info_dict': { + 'id': 'B098XNCHLD', + 'title': 'md5:5f3194dbf75a8dcfc83079bd63a2abed', + }, + 'playlist_mincount': 1, + 'playlist': [{ + 'info_dict': { + 'id': 'A1F83G8C2ARO7P', + 'ext': 'mp4', + 'title': 'mcdodo usb c cable 100W 5a', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }] + }, { + 'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3', + 'info_dict': { + 'id': 'B0863TXGM3', + 'title': 'md5:b0bde4881d3cfd40d63af19f7898b8ff', + }, + 'playlist_mincount': 4, + }, { + 'url': 'https://www.amazon.com/dp/B0845NXCXF/', + 'info_dict': { + 'id': 'B0845NXCXF', + 'title': 'md5:2145cd4e3c7782f1ee73649a3cff1171', + }, + 'playlist_mincount': 1, + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + data_json = self._parse_json(self._html_search_regex(r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'(.*)\'\)', webpage, 'data'), id) + entries = [{ + 'id': video['marketPlaceID'], + 'url': video['url'], + 'title': video.get('title'), + 'thumbnail': video.get('thumbUrl') or video.get('thumb'), + 'duration': video.get('durationSeconds'), + 'height': int_or_none(video.get('videoHeight')), + 'width': int_or_none(video.get('videoWidth')),
+ } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')] + return self.playlist_result(entries, playlist_id=id, playlist_title=data_json['title']) diff --git a/hypervideo_dl/extractor/animelab.py b/hypervideo_dl/extractor/animelab.py index 4fb7ee4..1c2cc47 100644 --- a/hypervideo_dl/extractor/animelab.py +++ b/hypervideo_dl/extractor/animelab.py @@ -15,25 +15,21 @@ from ..compat import compat_HTTPError class AnimeLabBaseIE(InfoExtractor): - _LOGIN_REQUIRED = True _LOGIN_URL = 'https://www.animelab.com/login' _NETRC_MACHINE = 'animelab' + _LOGGED_IN = False - def _login(self): - def is_logged_in(login_webpage): - return 'Sign In' not in login_webpage + def _is_logged_in(self, login_page=None): + if not self._LOGGED_IN: + if not login_page: + login_page = self._download_webpage(self._LOGIN_URL, None, 'Downloading login page') + AnimeLabBaseIE._LOGGED_IN = 'Sign In' not in login_page + return self._LOGGED_IN - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - - # Check if already logged in - if is_logged_in(login_page): + def _perform_login(self, username, password): + if self._is_logged_in(): return - (username, password) = self._get_login_info() - if username is None and self._LOGIN_REQUIRED: - self.raise_login_required('Login is required to access any AnimeLab content') - login_form = { 'email': username, 'password': password, @@ -47,17 +43,14 @@ class AnimeLabBaseIE(InfoExtractor): except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: raise ExtractorError('Unable to log in (wrong credentials?)', expected=True) - else: - raise + raise - # if login was successful - if is_logged_in(response): - return - - raise ExtractorError('Unable to login (cannot verify if logged in)') + if not self._is_logged_in(response): + raise ExtractorError('Unable to login (cannot verify if logged in)') def _real_initialize(self): - self._login() + if not self._is_logged_in(): + self.raise_login_required('Login is required to access any AnimeLab content') class AnimeLabIE(AnimeLabBaseIE): diff --git a/hypervideo_dl/extractor/animeondemand.py b/hypervideo_dl/extractor/animeondemand.py index 54e097d..2e674d5 100644 --- a/hypervideo_dl/extractor/animeondemand.py +++ b/hypervideo_dl/extractor/animeondemand.py @@ -8,6 +8,7 @@ from ..utils import ( determine_ext, extract_attributes, ExtractorError, + join_nonempty, url_or_none, urlencode_postdata, urljoin, @@ -52,11 +53,7 @@ class AnimeOnDemandIE(InfoExtractor): 'only_matching': True, }] - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') @@ -92,9 +89,6 @@ class AnimeOnDemandIE(InfoExtractor): raise ExtractorError('Unable to login: %s' % error, expected=True) raise ExtractorError('Unable to log in') - def _real_initialize(self): - self._login() - def _real_extract(self, url): anime_id = self._match_id(url) @@ -140,15 +134,8 @@ class AnimeOnDemandIE(InfoExtractor): kind = self._search_regex( r'videomaterialurl/\d+/([^/]+)/', playlist_url, 'media kind', default=None) - format_id_list = [] - if lang: - format_id_list.append(lang) - if kind: - format_id_list.append(kind) - if not format_id_list and num is not None: - format_id_list.append(compat_str(num)) - format_id = '-'.join(format_id_list) - format_note = ', '.join(filter(None, (kind, lang_note))) + format_id = 
join_nonempty(lang, kind) if lang or kind else str(num) + format_note = join_nonempty(kind, lang_note, delim=', ') item_id_list = [] if format_id: item_id_list.append(format_id) @@ -195,12 +182,10 @@ class AnimeOnDemandIE(InfoExtractor): if not file_: continue ext = determine_ext(file_) - format_id_list = [lang, kind] - if ext == 'm3u8': - format_id_list.append('hls') - elif source.get('type') == 'video/dash' or ext == 'mpd': - format_id_list.append('dash') - format_id = '-'.join(filter(None, format_id_list)) + format_id = join_nonempty( + lang, kind, + 'hls' if ext == 'm3u8' else None, + 'dash' if source.get('type') == 'video/dash' or ext == 'mpd' else None) if ext == 'm3u8': file_formats = self._extract_m3u8_formats( file_, video_id, 'mp4', diff --git a/hypervideo_dl/extractor/ant1newsgr.py b/hypervideo_dl/extractor/ant1newsgr.py new file mode 100644 index 0000000..1075b46 --- /dev/null +++ b/hypervideo_dl/extractor/ant1newsgr.py @@ -0,0 +1,143 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + HEADRequest, + ExtractorError, + determine_ext, + scale_thumbnails_to_max_format_width, + unescapeHTML, +) + + +class Ant1NewsGrBaseIE(InfoExtractor): + def _download_and_extract_api_data(self, video_id, netloc, cid=None): + url = f'{self.http_scheme()}//{netloc}{self._API_PATH}' + info = self._download_json(url, video_id, query={'cid': cid or video_id}) + try: + source = info['url'] + except KeyError: + raise ExtractorError('no source found for %s' % video_id) + formats, subs = (self._extract_m3u8_formats_and_subtitles(source, video_id, 'mp4') + if determine_ext(source) == 'm3u8' else ([{'url': source}], {})) + self._sort_formats(formats) + thumbnails = scale_thumbnails_to_max_format_width( + formats, [{'url': info['thumb']}], r'(?<=/imgHandler/)\d+') + return { + 'id': video_id, + 'title': info.get('title'), + 'thumbnails': thumbnails, + 'formats': formats, + 'subtitles': subs, + } + + +class Ant1NewsGrWatchIE(Ant1NewsGrBaseIE): + IE_NAME = 'ant1newsgr:watch' + IE_DESC = 'ant1news.gr videos' + _VALID_URL = r'https?://(?P<netloc>(?:www\.)?ant1news\.gr)/watch/(?P<id>\d+)/' + _API_PATH = '/templates/data/player' + + _TESTS = [{ + 'url': 'https://www.ant1news.gr/watch/1506168/ant1-news-09112021-stis-18-45', + 'md5': '95925e6b32106754235f2417e0d2dfab', + 'info_dict': { + 'id': '1506168', + 'ext': 'mp4', + 'title': 'md5:0ad00fa66ecf8aa233d26ab0dba7514a', + 'description': 'md5:18665af715a6dcfeac1d6153a44f16b0', + 'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/26d46bf6-8158-4f02-b197-7096c714b2de.jpg', + }, + }] + + def _real_extract(self, url): + video_id, netloc = self._match_valid_url(url).group('id', 'netloc') + webpage = self._download_webpage(url, video_id) + info = self._download_and_extract_api_data(video_id, netloc) + info['description'] = self._og_search_description(webpage) + return info + +
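_download_and_extract_api_data above branches purely on the source extension: .m3u8 manifests get expanded into per-variant formats plus subtitles, anything else is kept as a single progressive format. The decision itself reduces to (sketch, hypothetical helper):

from urllib.parse import urlparse

def is_hls(source_url):
    # mirrors the determine_ext(source) == 'm3u8' check above
    return urlparse(source_url).path.rsplit('.', 1)[-1] == 'm3u8'

assert is_hls('https://example.invalid/stream/master.m3u8')
assert not is_hls('https://example.invalid/videos/clip.mp4')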
+class Ant1NewsGrArticleIE(Ant1NewsGrBaseIE): + IE_NAME = 'ant1newsgr:article' + IE_DESC = 'ant1news.gr articles' + _VALID_URL = r'https?://(?:www\.)?ant1news\.gr/[^/]+/article/(?P<id>\d+)/' + + _TESTS = [{ + 'url': 'https://www.ant1news.gr/afieromata/article/549468/o-tzeims-mpont-sta-meteora-oi-apeiles-kai-o-xesikomos-ton-kalogeron', + 'md5': '294f18331bb516539d72d85a82887dcc', + 'info_dict': { + 'id': '_xvg/m_cmbatw=', + 'ext': 'mp4', + 'title': 'md5:a93e8ecf2e4073bfdffcb38f59945411', + 'timestamp': 1603092840, + 'upload_date': '20201019', + 'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/756206d2-d640-40e2-b201-3555abdfc0db.jpg', + }, + }, { + 'url': 'https://ant1news.gr/Society/article/620286/symmoria-anilikon-dikigoros-thymaton-ithelan-na-toys-apoteleiosoyn', + 'info_dict': { + 'id': '620286', + 'title': 'md5:91fe569e952e4d146485740ae927662b', + }, + 'playlist_mincount': 2, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + info = self._search_json_ld(webpage, video_id, expected_type='NewsArticle') + embed_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage)) + if not embed_urls: + raise ExtractorError('no videos found for %s' % video_id, expected=True) + return self.playlist_from_matches( + embed_urls, video_id, info.get('title'), ie=Ant1NewsGrEmbedIE.ie_key(), + video_kwargs={'url_transparent': True, 'timestamp': info.get('timestamp')}) + + +class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE): + IE_NAME = 'ant1newsgr:embed' + IE_DESC = 'ant1news.gr embedded videos' + _BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player' + _VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P<id>[^#&]+)' + _API_PATH = '/news/templates/data/jsonPlayer' + + _TESTS = [{ + 'url': 'https://www.antenna.gr/templates/pages/player?cid=3f_li_c_az_jw_y_u=&w=670&h=377', + 'md5': 'dfc58c3a11a5a9aad2ba316ed447def3', + 'info_dict': { + 'id': '3f_li_c_az_jw_y_u=', + 'ext': 'mp4', + 'title': 'md5:a30c93332455f53e1e84ae0724f0adf7', + 'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/bbe31201-3f09-4a4e-87f5-8ad2159fffe2.jpg', + }, + }] + + @classmethod + def _extract_urls(cls, webpage): + _EMBED_URL_RE = rf'{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+' + _EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_EMBED_URL_RE})(?P=_q1)' + for mobj in re.finditer(_EMBED_RE, webpage): + url = unescapeHTML(mobj.group('url')) + if not cls.suitable(url): + continue + yield url + + def _real_extract(self, url): + video_id = self._match_id(url) + + canonical_url = self._request_webpage( + HEADRequest(url), video_id, + note='Resolve canonical player URL', + errnote='Could not resolve canonical player URL').geturl() + _, netloc, _, _, query, _ = urllib.parse.urlparse(canonical_url) + cid = urllib.parse.parse_qs(query)['cid'][0] + + return self._download_and_extract_api_data(video_id, netloc, cid=cid) diff --git a/hypervideo_dl/extractor/anvato.py b/hypervideo_dl/extractor/anvato.py index b82f0b5..686d453 100644 --- a/hypervideo_dl/extractor/anvato.py +++ b/hypervideo_dl/extractor/anvato.py @@ -16,6 +16,7 @@ from ..utils import ( determine_ext, intlist_to_bytes, int_or_none, + join_nonempty, strip_jsonp, unescapeHTML, unsmuggle_url, @@ -303,13 +304,13 @@ class AnvatoIE(InfoExtractor): tbr = int_or_none(published_url.get('kbps')) a_format = { 'url': video_url, - 'format_id': ('-'.join(filter(None, ['http', published_url.get('cdn_name')]))).lower(), - 'tbr': tbr if tbr != 0 else None, + 'format_id': join_nonempty('http', published_url.get('cdn_name')).lower(), + 'tbr': tbr or None, } if media_format == 'm3u8' and tbr is not None: a_format.update({ - 'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])), + 'format_id': join_nonempty('hls', tbr), 'ext': 'mp4', }) elif media_format == 'm3u8-variant' or ext == 'm3u8': diff --git a/hypervideo_dl/extractor/aparat.py b/hypervideo_dl/extractor/aparat.py index da06a3c..1057233 100644 --- a/hypervideo_dl/extractor/aparat.py +++ b/hypervideo_dl/extractor/aparat.py @@ -33,19 
+33,22 @@ class AparatIE(InfoExtractor): 'only_matching': True, }] + def _parse_options(self, webpage, video_id, fatal=True): + return self._parse_json(self._search_regex( + r'options\s*=\s*({.+?})\s*;', webpage, 'options', default='{}'), video_id) + def _real_extract(self, url): video_id = self._match_id(url) - # Provides more metadata + # If available, provides more metadata webpage = self._download_webpage(url, video_id, fatal=False) + options = self._parse_options(webpage, video_id, fatal=False) - if not webpage: + if not options: webpage = self._download_webpage( 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, - video_id) - - options = self._parse_json(self._search_regex( - r'options\s*=\s*({.+?})\s*;', webpage, 'options'), video_id) + video_id, 'Downloading embed webpage') + options = self._parse_options(webpage, video_id) formats = [] for sources in (options.get('multiSRC') or []): diff --git a/hypervideo_dl/extractor/applepodcasts.py b/hypervideo_dl/extractor/applepodcasts.py index 6a74de7..9139ff7 100644 --- a/hypervideo_dl/extractor/applepodcasts.py +++ b/hypervideo_dl/extractor/applepodcasts.py @@ -3,7 +3,9 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + clean_html, clean_podcast_url, + get_element_by_class, int_or_none, parse_iso8601, try_get, @@ -14,16 +16,17 @@ class ApplePodcastsIE(InfoExtractor): _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)' _TESTS = [{ 'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', - 'md5': 'df02e6acb11c10e844946a39e7222b08', + 'md5': '41dc31cd650143e530d9423b6b5a344f', 'info_dict': { 'id': '1000482637777', 'ext': 'mp3', 'title': '207 - Whitney Webb Returns', - 'description': 'md5:13a73bade02d2e43737751e3987e1399', + 'description': 'md5:75ef4316031df7b41ced4e7b987f79c6', 'upload_date': '20200705', - 'timestamp': 1593921600, - 'duration': 6425, + 'timestamp': 1593932400, + 'duration': 6454, 'series': 'The Tim Dillon Show', + 'thumbnail': 're:.+[.](png|jpe?g|webp)', } }, { 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', @@ -39,24 +42,47 @@ class ApplePodcastsIE(InfoExtractor): def _real_extract(self, url): episode_id = self._match_id(url) webpage = self._download_webpage(url, episode_id) - ember_data = self._parse_json(self._search_regex( - r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', - webpage, 'ember data'), episode_id) - ember_data = ember_data.get(episode_id) or ember_data - episode = ember_data['data']['attributes'] + episode_data = {} + ember_data = {} + # new page type 2021-11 + amp_data = self._parse_json(self._search_regex( + r'(?s)id="shoebox-media-api-cache-amp-podcasts"[^>]*>\s*({.+?})\s*<', + webpage, 'AMP data', default='{}'), episode_id, fatal=False) or {} + amp_data = try_get(amp_data, + lambda a: self._parse_json( + next(a[x] for x in iter(a) if episode_id in x), + episode_id), + dict) or {} + amp_data = amp_data.get('d') or [] + episode_data = try_get( + amp_data, + lambda a: next(x for x in a + if x['type'] == 'podcast-episodes' and x['id'] == episode_id), + dict) + if not episode_data: + # try pre 2021-11 page type: TODO: consider deleting if no longer used + ember_data = self._parse_json(self._search_regex( + r'(?s)id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', + webpage, 'ember data'), episode_id) or {} + ember_data = ember_data.get(episode_id) or ember_data + episode_data = 
try_get(ember_data, lambda x: x['data'], dict) + episode = episode_data['attributes'] description = episode.get('description') or {} series = None - for inc in (ember_data.get('included') or []): + for inc in (amp_data or ember_data.get('included') or []): if inc.get('type') == 'media/podcast': series = try_get(inc, lambda x: x['attributes']['name']) + series = series or clean_html(get_element_by_class('podcast-header__identity', webpage)) return { 'id': episode_id, - 'title': episode['name'], + 'title': episode.get('name'), 'url': clean_podcast_url(episode['assetUrl']), 'description': description.get('standard') or description.get('short'), 'timestamp': parse_iso8601(episode.get('releaseDateTime')), 'duration': int_or_none(episode.get('durationInMilliseconds'), 1000), 'series': series, + 'thumbnail': self._og_search_thumbnail(webpage), + 'vcodec': 'none', } diff --git a/hypervideo_dl/extractor/archiveorg.py b/hypervideo_dl/extractor/archiveorg.py index d90fcb1..2ab3c1b 100644 --- a/hypervideo_dl/extractor/archiveorg.py +++ b/hypervideo_dl/extractor/archiveorg.py @@ -3,33 +3,37 @@ from __future__ import unicode_literals import re import json - from .common import InfoExtractor -from .youtube import YoutubeIE +from .youtube import YoutubeIE, YoutubeBaseInfoExtractor from ..compat import ( compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, compat_HTTPError ) from ..utils import ( + bug_reports_message, clean_html, - determine_ext, dict_get, extract_attributes, ExtractorError, + get_element_by_id, HEADRequest, int_or_none, + join_nonempty, KNOWN_EXTENSIONS, merge_dicts, mimetype2ext, + orderedSet, parse_duration, parse_qs, - RegexNotFoundError, str_to_int, str_or_none, + traverse_obj, try_get, unified_strdate, unified_timestamp, + urlhandle_detect_ext, + url_or_none ) @@ -61,7 +65,7 @@ class ArchiveOrgIE(InfoExtractor): 'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c', 'uploader': 'yorkmba99@hotmail.com', 'timestamp': 1387699629, - 'upload_date': "20131222", + 'upload_date': '20131222', }, }, { 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect', @@ -147,8 +151,7 @@ class ArchiveOrgIE(InfoExtractor): # Archive.org metadata API doesn't clearly demarcate playlist entries # or subtitle tracks, so we get them from the embeddable player. 
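The comment above names the gap this refactor works around: the metadata API describes an item's files, while playlist entries and subtitle tracks only surface in the embed player page, so both are fetched. The metadata side is one JSON call; a minimal sketch:

import json
import urllib.parse
import urllib.request

def fetch_item_metadata(identifier):
    # same endpoint the code below queries via _download_json
    url = 'https://archive.org/metadata/' + urllib.parse.quote(identifier)
    with urllib.request.urlopen(url) as resp:
        return json.load(resp)

# e.g. fetch_item_metadata('XD300-23_68HighlightsAResearchCntAugHumanIntellect')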
- embed_page = self._download_webpage( - 'https://archive.org/embed/' + identifier, identifier) + embed_page = self._download_webpage(f'https://archive.org/embed/{identifier}', identifier) playlist = self._playlist_data(embed_page) entries = {} @@ -163,17 +166,17 @@ class ArchiveOrgIE(InfoExtractor): 'thumbnails': [], 'artist': p.get('artist'), 'track': p.get('title'), - 'subtitles': {}} + 'subtitles': {}, + } for track in p.get('tracks', []): if track['kind'] != 'subtitles': continue - entries[p['orig']][track['label']] = { - 'url': 'https://archive.org/' + track['file'].lstrip('/')} + 'url': 'https://archive.org/' + track['file'].lstrip('/') + } - metadata = self._download_json( - 'http://archive.org/metadata/' + identifier, identifier) + metadata = self._download_json('http://archive.org/metadata/' + identifier, identifier) m = metadata['metadata'] identifier = m['identifier'] @@ -186,7 +189,7 @@ class ArchiveOrgIE(InfoExtractor): 'license': m.get('licenseurl'), 'release_date': unified_strdate(m.get('date')), 'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])), - 'webpage_url': 'https://archive.org/details/' + identifier, + 'webpage_url': f'https://archive.org/details/{identifier}', 'location': m.get('venue'), 'release_year': int_or_none(m.get('year'))} @@ -204,7 +207,7 @@ class ArchiveOrgIE(InfoExtractor): 'discnumber': int_or_none(f.get('disc')), 'release_year': int_or_none(f.get('year'))}) entry = entries[f['name']] - elif f.get('original') in entries: + elif traverse_obj(f, 'original', expected_type=str) in entries: entry = entries[f['original']] else: continue @@ -227,13 +230,12 @@ class ArchiveOrgIE(InfoExtractor): 'filesize': int_or_none(f.get('size')), 'protocol': 'https'}) - # Sort available formats by filesize for entry in entries.values(): - entry['formats'] = list(sorted(entry['formats'], key=lambda x: x.get('filesize', -1))) + self._sort_formats(entry['formats']) if len(entries) == 1: # If there's only one item, use it as the main info dict - only_video = entries[list(entries.keys())[0]] + only_video = next(iter(entries.values())) if entry_id: info = merge_dicts(only_video, info) else: @@ -258,19 +260,19 @@ class ArchiveOrgIE(InfoExtractor): class YoutubeWebArchiveIE(InfoExtractor): IE_NAME = 'web.archive:youtube' - IE_DESC = 'web.archive.org saved youtube videos' - _VALID_URL = r"""(?x)^ - (?:https?://)?web\.archive\.org/ - (?:web/)? - (?:[0-9A-Za-z_*]+/)? # /web and the version index is optional - - (?:https?(?::|%3[Aa])//)? - (?: - (?:\w+\.)?youtube\.com/watch(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL - |(wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url - ) - (?P<id>[0-9A-Za-z_-]{11})(?:%26|\#|&|$) - """ + IE_DESC = 'web.archive.org saved youtube videos, "ytarchive:" prefix' + _VALID_URL = r'''(?x)(?:(?P<prefix>ytarchive:)| + (?:https?://)?web\.archive\.org/ + (?:web/)?(?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)? # /web and the version index is optional + (?:https?(?::|%3[Aa])//)?(?: + (?:\w+\.)?youtube\.com(?::(?:80|443))?/watch(?:\.php)?(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL + |(?:wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url + ) + )(?P<id>[0-9A-Za-z_-]{11}) + (?(prefix) + (?::(?P<date2>[0-9]{14}))?$| + (?:%26|[#&]|$) + )''' _TESTS = [ { @@ -278,141 +280,395 @@ 'info_dict': { 'id': 'aYAGB11YrSs', 'ext': 'webm', - 'title': 'Team Fortress 2 - Sandviches!' 
+ 'title': 'Team Fortress 2 - Sandviches!', + 'description': 'md5:4984c0f9a07f349fc5d8e82ab7af4eaf', + 'upload_date': '20110926', + 'uploader': 'Zeurel', + 'channel_id': 'UCukCyHaD-bK3in_pKpfH9Eg', + 'duration': 32, + 'uploader_id': 'Zeurel', + 'uploader_url': 'http://www.youtube.com/user/Zeurel' } - }, - { + }, { # Internal link 'url': 'https://web.archive.org/web/2oe/http://wayback-fakeurl.archive.org/yt/97t7Xj_iBv0', 'info_dict': { 'id': '97t7Xj_iBv0', 'ext': 'mp4', - 'title': 'How Flexible Machines Could Save The World' + 'title': 'Why Machines That Bend Are Better', + 'description': 'md5:00404df2c632d16a674ff8df1ecfbb6c', + 'upload_date': '20190312', + 'uploader': 'Veritasium', + 'channel_id': 'UCHnyfMqiRRG1u-2MsSQLbXA', + 'duration': 771, + 'uploader_id': '1veritasium', + 'uploader_url': 'http://www.youtube.com/user/1veritasium' } - }, - { - # Video from 2012, webm format itag 45. + }, { + # Video from 2012, webm format itag 45. Newest capture is deleted video, with an invalid description. + # Should use the date in the link. Title ends with '- Youtube'. Capture has description in eow-description 'url': 'https://web.archive.org/web/20120712231619/http://www.youtube.com/watch?v=AkhihxRKcrs&gl=US&hl=en', 'info_dict': { 'id': 'AkhihxRKcrs', 'ext': 'webm', - 'title': 'Limited Run: Mondo\'s Modern Classic 1 of 3 (SDCC 2012)' + 'title': 'Limited Run: Mondo\'s Modern Classic 1 of 3 (SDCC 2012)', + 'upload_date': '20120712', + 'duration': 398, + 'description': 'md5:ff4de6a7980cb65d951c2f6966a4f2f3', + 'uploader_id': 'machinima', + 'uploader_url': 'http://www.youtube.com/user/machinima' } - }, - { - # Old flash-only video. Webpage title starts with "YouTube - ". + }, { + # FLV video. Video file URL does not provide itag information 'url': 'https://web.archive.org/web/20081211103536/http://www.youtube.com/watch?v=jNQXAC9IVRw', 'info_dict': { 'id': 'jNQXAC9IVRw', - 'ext': 'unknown_video', - 'title': 'Me at the zoo' + 'ext': 'flv', + 'title': 'Me at the zoo', + 'upload_date': '20050423', + 'channel_id': 'UC4QobU6STFB0P71PMvOGN5A', + 'duration': 19, + 'description': 'md5:10436b12e07ac43ff8df65287a56efb4', + 'uploader_id': 'jawed', + 'uploader_url': 'http://www.youtube.com/user/jawed' } - }, - { - # Flash video with .flv extension (itag 34). Title has prefix "YouTube -" - # Title has some weird unicode characters too. + }, { 'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA', 'info_dict': { 'id': 'lTx3G6h2xyA', 'ext': 'flv', - 'title': '‪Madeon - Pop Culture (live mashup)‬‏' + 'title': 'Madeon - Pop Culture (live mashup)', + 'upload_date': '20110711', + 'uploader': 'Madeon', + 'channel_id': 'UCqMDNf3Pn5L7pcNkuSEeO3w', + 'duration': 204, + 'description': 'md5:f7535343b6eda34a314eff8b85444680', + 'uploader_id': 'itsmadeon', + 'uploader_url': 'http://www.youtube.com/user/itsmadeon' } - }, - { # Some versions of Youtube have have "YouTube" as page title in html (and later rewritten by js). - 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw', + }, { + # First capture is of dead video, second is the oldest from CDX response. + 'url': 'https://web.archive.org/https://www.youtube.com/watch?v=1JYutPM8O6E', 'info_dict': { - 'id': 'kH-G_aIBlFw', + 'id': '1JYutPM8O6E', 'ext': 'mp4', - 'title': 'kH-G_aIBlFw' - }, - 'expected_warnings': [ - 'unable to extract title', - ] - }, - { - # First capture is a 302 redirect intermediary page. 
- 'url': 'https://web.archive.org/web/20050214000000/http://www.youtube.com/watch?v=0altSZ96U4M', + 'title': 'Fake Teen Doctor Strikes AGAIN! - Weekly Weird News', + 'upload_date': '20160218', + 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA', + 'duration': 1236, + 'description': 'md5:21032bae736421e89c2edf36d1936947', + 'uploader_id': 'MachinimaETC', + 'uploader_url': 'http://www.youtube.com/user/MachinimaETC' + } + }, { + # First capture of dead video, capture date in link links to dead capture. + 'url': 'https://web.archive.org/web/20180803221945/https://www.youtube.com/watch?v=6FPhZJGvf4E', 'info_dict': { - 'id': '0altSZ96U4M', + 'id': '6FPhZJGvf4E', 'ext': 'mp4', - 'title': '0altSZ96U4M' + 'title': 'WTF: Video Games Still Launch BROKEN?! - T.U.G.S.', + 'upload_date': '20160219', + 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA', + 'duration': 798, + 'description': 'md5:a1dbf12d9a3bd7cb4c5e33b27d77ffe7', + 'uploader_id': 'MachinimaETC', + 'uploader_url': 'http://www.youtube.com/user/MachinimaETC' }, 'expected_warnings': [ - 'unable to extract title', + r'unable to download capture webpage \(it may not be archived\)' ] - }, - { + }, { # Very old YouTube page, has - YouTube in title. + 'url': 'http://web.archive.org/web/20070302011044/http://youtube.com/watch?v=-06-KB9XTzg', + 'info_dict': { + 'id': '-06-KB9XTzg', + 'ext': 'flv', + 'title': 'New Coin Hack!! 100% Safe!!' + } + }, { + 'url': 'web.archive.org/https://www.youtube.com/watch?v=dWW7qP423y8', + 'info_dict': { + 'id': 'dWW7qP423y8', + 'ext': 'mp4', + 'title': 'It\'s Bootleg AirPods Time.', + 'upload_date': '20211021', + 'channel_id': 'UC7Jwj9fkrf1adN4fMmTkpug', + 'channel_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug', + 'duration': 810, + 'description': 'md5:7b567f898d8237b256f36c1a07d6d7bc', + 'uploader': 'DankPods', + 'uploader_id': 'UC7Jwj9fkrf1adN4fMmTkpug', + 'uploader_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug' + } + }, { + # player response contains '};' See: https://github.com/ytdl-org/youtube-dl/issues/27093 + 'url': 'https://web.archive.org/web/20200827003909if_/http://www.youtube.com/watch?v=6Dh-RL__uN4', + 'info_dict': { + 'id': '6Dh-RL__uN4', + 'ext': 'mp4', + 'title': 'bitch lasagna', + 'upload_date': '20181005', + 'channel_id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'channel_url': 'http://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'duration': 135, + 'description': 'md5:2dbe4051feeff2dab5f41f82bb6d11d0', + 'uploader': 'PewDiePie', + 'uploader_id': 'PewDiePie', + 'uploader_url': 'http://www.youtube.com/user/PewDiePie' + } + }, { + 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw', + 'only_matching': True + }, { + 'url': 'https://web.archive.org/web/20050214000000_if/http://www.youtube.com/watch?v=0altSZ96U4M', + 'only_matching': True + }, { # Video not archived, only capture is unavailable video page 'url': 'https://web.archive.org/web/20210530071008/https://www.youtube.com/watch?v=lHJTf93HL1s&spfreload=10', - 'only_matching': True, - }, - { # Encoded url + 'only_matching': True + }, { # Encoded url 'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fgl%3DUS%26v%3DAkhihxRKcrs%26hl%3Den', - 'only_matching': True, - }, - { + 'only_matching': True + }, { 'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fv%3DAkhihxRKcrs%26gl%3DUS%26hl%3Den', - 'only_matching': True, - } + 'only_matching': True + }, { + 'url': 
'https://web.archive.org/web/20060527081937/http://www.youtube.com:80/watch.php?v=ELTFsLT73fA&search=soccer',
+        'only_matching': True
+    }, {
+        'url': 'https://web.archive.org/http://www.youtube.com:80/watch?v=-05VVye-ffg',
+        'only_matching': True
+    }, {
+        'url': 'ytarchive:BaW_jenozKc:20050214000000',
+        'only_matching': True
+    }, {
+        'url': 'ytarchive:BaW_jenozKc',
+        'only_matching': True
+    },
     ]
+    _YT_INITIAL_DATA_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE
+    _YT_INITIAL_PLAYER_RESPONSE_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*({.+?})[)\s]*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE
+    _YT_INITIAL_BOUNDARY_RE = r'(?:(?:var\s+meta|</script|\n)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_BOUNDARY_RE
+
+    _YT_DEFAULT_THUMB_SERVERS = ['i.ytimg.com']  # thumbnails most likely archived on these servers
+    _YT_ALL_THUMB_SERVERS = orderedSet(
+        _YT_DEFAULT_THUMB_SERVERS + ['img.youtube.com', *[f'{c}{n or ""}.ytimg.com' for c in ('i', 's') for n in (*range(0, 5), 9)]])
+
+    _WAYBACK_BASE_URL = 'https://web.archive.org/web/%sif_/'
+    _OLDEST_CAPTURE_DATE = 20050214000000
+    _NEWEST_CAPTURE_DATE = 20500101000000
+
+    def _call_cdx_api(self, item_id, url, filters: list = None, collapse: list = None, query: dict = None, note=None, fatal=False):
+        # CDX docs: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md
+        query = {
+            'url': url,
+            'output': 'json',
+            'fl': 'original,mimetype,length,timestamp',
+            'limit': 500,
+            'filter': ['statuscode:200'] + (filters or []),
+            'collapse': collapse or [],
+            **(query or {})
+        }
+        res = self._download_json(
+            'https://web.archive.org/cdx/search/cdx', item_id,
+            note or 'Downloading CDX API JSON', query=query, fatal=fatal)
+        if isinstance(res, list) and len(res) >= 2:
+            # format response to make it easier to use
+            return list(dict(zip(res[0], v)) for v in res[1:])
+        elif not isinstance(res, list) or len(res) != 0:
+            self.report_warning('Error while parsing CDX API response' + bug_reports_message())
+
+    def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
+        return self._parse_json(self._search_regex(
+            (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
+             regex), webpage, name, default='{}'), video_id, fatal=False)
+
+    def _extract_webpage_title(self, webpage):
+        page_title = self._html_extract_title(webpage, default='')
+        # YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix.
+        return self._html_search_regex(
+            r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)',
+            page_title, 'title', default='')
+
+    def _extract_metadata(self, video_id, webpage):
+        search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None))
+        player_response = self._extract_yt_initial_variable(
+            webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, video_id, 'initial player response') or {}
+        initial_data = self._extract_yt_initial_variable(
+            webpage, self._YT_INITIAL_DATA_RE, video_id, 'initial player response') or {}
+
+        initial_data_video = traverse_obj(
+            initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'videoPrimaryInfoRenderer'),
+            expected_type=dict, get_all=False, default={})
+
+        video_details = traverse_obj(
+            player_response, 'videoDetails', expected_type=dict, get_all=False, default={})
+
+        microformats = traverse_obj(
+            player_response, ('microformat', 'playerMicroformatRenderer'), expected_type=dict, get_all=False, default={})
+
+        video_title = (
+            video_details.get('title')
+            or YoutubeBaseInfoExtractor._get_text(microformats, 'title')
+            or YoutubeBaseInfoExtractor._get_text(initial_data_video, 'title')
+            or self._extract_webpage_title(webpage)
+            or search_meta(['og:title', 'twitter:title', 'title']))
+
+        channel_id = str_or_none(
+            video_details.get('channelId')
+            or microformats.get('externalChannelId')
+            or search_meta('channelId')
+            or self._search_regex(
+                r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',  # @b45a9e6
+                webpage, 'channel id', default=None, group='id'))
+        channel_url = f'http://www.youtube.com/channel/{channel_id}' if channel_id else None
+
+        duration = int_or_none(
+            video_details.get('lengthSeconds')
+            or microformats.get('lengthSeconds')
+            or parse_duration(search_meta('duration')))
+        description = (
+            video_details.get('shortDescription')
+            or YoutubeBaseInfoExtractor._get_text(microformats, 'description')
+            or clean_html(get_element_by_id('eow-description', webpage))  # @9e6dd23
+            or search_meta(['description', 'og:description', 'twitter:description']))
+
+        uploader = video_details.get('author')
+
+        # Uploader ID and URL
+        uploader_mobj = re.search(
+            r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',  # @fd05024
+            webpage)
+        if uploader_mobj is not None:
+            uploader_id, uploader_url = uploader_mobj.group('uploader_id'), uploader_mobj.group('uploader_url')
+        else:
+            # @a6211d2
+            uploader_url = url_or_none(microformats.get('ownerProfileUrl'))
+            uploader_id = self._search_regex(
+                r'(?:user|channel)/([^/]+)', uploader_url or '', 'uploader id', default=None)
+
+        upload_date = unified_strdate(
+            dict_get(microformats, ('uploadDate', 'publishDate'))
+            or search_meta(['uploadDate', 'datePublished'])
+            or self._search_regex(
+                [r'(?s)id="eow-date.*?>(.*?)</span>',
+                 r'(?:id="watch-uploader-info".*?>.*?<strong>|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],  # @7998520
+                webpage, 'upload date', default=None))
+
+        return {
+            'title': video_title,
+            'description': description,
+            'upload_date': upload_date,
+            'uploader': uploader,
+            'channel_id': channel_id,
+            'channel_url': channel_url,
+            'duration': duration,
+            'uploader_url': uploader_url,
+            'uploader_id': uploader_id,
+        }
+
+    def _extract_thumbnails(self, video_id):
+        try_all = 'thumbnails' in self._configuration_arg('check_all')
+        thumbnail_base_urls = ['http://{server}/vi{webp}/{video_id}'.format(
+            webp='_webp' if ext == 'webp' else '', video_id=video_id, server=server)
+            for server in (self._YT_ALL_THUMB_SERVERS if try_all else self._YT_DEFAULT_THUMB_SERVERS) for ext in (('jpg', 'webp') if try_all else ('jpg',))]
+
+        thumbnails = []
+        for url in thumbnail_base_urls:
+            response = self._call_cdx_api(
+                video_id, url, filters=['mimetype:image/(?:webp|jpeg)'],
+                collapse=['urlkey'], query={'matchType': 'prefix'})
+            if not response:
+                continue
+            thumbnails.extend(
+                {
+                    'url': (self._WAYBACK_BASE_URL % (int_or_none(thumbnail_dict.get('timestamp')) or self._OLDEST_CAPTURE_DATE)) + thumbnail_dict.get('original'),
+                    'filesize': int_or_none(thumbnail_dict.get('length')),
+                    'preference': int_or_none(thumbnail_dict.get('length'))
+                } for thumbnail_dict in response)
+            if not try_all:
+                break
+
+        self._remove_duplicate_formats(thumbnails)
+        return thumbnails
+
+    def _get_capture_dates(self, video_id, url_date):
+        capture_dates = []
+        # Note: CDX API will not find watch pages with extra params in the url.
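
As an aside before the lookup continues below: the CDX query this method issues is easy to replay standalone. A minimal stdlib-only sketch (not part of the patch; same endpoint and fields as _call_cdx_api above), showing why the helper zips row 0 against the remaining rows:

    import json
    import urllib.parse
    import urllib.request

    def cdx_captures(page_url):
        # With output=json the first row is the column names and every
        # following row is one archived capture of page_url.
        query = urllib.parse.urlencode({
            'url': page_url,
            'output': 'json',
            'fl': 'original,mimetype,length,timestamp',
            'filter': 'statuscode:200',
            'limit': 500,
        })
        with urllib.request.urlopen('https://web.archive.org/cdx/search/cdx?' + query) as resp:
            rows = json.load(resp)
        if len(rows) < 2:
            return []
        return [dict(zip(rows[0], row)) for row in rows[1:]]

    # Each capture's 14-digit 'timestamp' (YYYYMMDDhhmmss) is the value the
    # capture-date logic below sorts and prefers.
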
+ response = self._call_cdx_api( + video_id, f'https://www.youtube.com/watch?v={video_id}', + filters=['mimetype:text/html'], collapse=['timestamp:6', 'digest'], query={'matchType': 'prefix'}) or [] + all_captures = sorted([int_or_none(r['timestamp']) for r in response if int_or_none(r['timestamp']) is not None]) + + # Prefer the new polymer UI captures as we support extracting more metadata from them + # WBM captures seem to all switch to this layout ~July 2020 + modern_captures = [x for x in all_captures if x >= 20200701000000] + if modern_captures: + capture_dates.append(modern_captures[0]) + capture_dates.append(url_date) + if all_captures: + capture_dates.append(all_captures[0]) + + if 'captures' in self._configuration_arg('check_all'): + capture_dates.extend(modern_captures + all_captures) + + # Fallbacks if any of the above fail + capture_dates.extend([self._OLDEST_CAPTURE_DATE, self._NEWEST_CAPTURE_DATE]) + return orderedSet(filter(None, capture_dates)) def _real_extract(self, url): - video_id = self._match_id(url) - title = video_id # if we are not able get a title - - def _extract_title(webpage): - page_title = self._html_search_regex( - r'([^<]*)', webpage, 'title', fatal=False) or '' - # YouTube video pages appear to always have either 'YouTube -' as suffix or '- YouTube' as prefix. - try: - page_title = self._html_search_regex( - r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)', - page_title, 'title', default='') - except RegexNotFoundError: - page_title = None - - if not page_title: - self.report_warning('unable to extract title', video_id=video_id) - return - return page_title - - # If the video is no longer available, the oldest capture may be one before it was removed. - # Setting the capture date in url to early date seems to redirect to earliest capture. - webpage = self._download_webpage( - 'https://web.archive.org/web/20050214000000/http://www.youtube.com/watch?v=%s' % video_id, - video_id=video_id, fatal=False, errnote='unable to download video webpage (probably not archived).') - if webpage: - title = _extract_title(webpage) or title - - # Use link translator mentioned in https://github.com/ytdl-org/youtube-dl/issues/13655 - internal_fake_url = 'https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id + video_id, url_date, url_date_2 = self._match_valid_url(url).group('id', 'date', 'date2') + url_date = url_date or url_date_2 + + urlh = None try: - video_file_webpage = self._request_webpage( - HEADRequest(internal_fake_url), video_id, - note='Fetching video file url', expected_status=True) + urlh = self._request_webpage( + HEADRequest('https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id), + video_id, note='Fetching archived video file url', expected_status=True) except ExtractorError as e: # HTTP Error 404 is expected if the video is not saved. if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: - raise ExtractorError( - 'HTTP Error %s. Most likely the video is not archived or issue with web.archive.org.' 
% e.cause.code, + self.raise_no_formats( + 'The requested video is not archived, indexed, or there is an issue with web.archive.org', expected=True) - raise - video_file_url = compat_urllib_parse_unquote(video_file_webpage.url) - video_file_url_qs = parse_qs(video_file_url) - - # Attempt to recover any ext & format info from playback url - format = {'url': video_file_url} - itag = try_get(video_file_url_qs, lambda x: x['itag'][0]) - if itag and itag in YoutubeIE._formats: # Naughty access but it works - format.update(YoutubeIE._formats[itag]) - format.update({'format_id': itag}) - else: - mime = try_get(video_file_url_qs, lambda x: x['mime'][0]) - ext = mimetype2ext(mime) or determine_ext(video_file_url) - format.update({'ext': ext}) - return { - 'id': video_id, - 'title': title, - 'formats': [format], - 'duration': str_to_int(try_get(video_file_url_qs, lambda x: x['dur'][0])) - } + else: + raise + + capture_dates = self._get_capture_dates(video_id, int_or_none(url_date)) + self.write_debug('Captures to try: ' + join_nonempty(*capture_dates, delim=', ')) + info = {'id': video_id} + for capture in capture_dates: + webpage = self._download_webpage( + (self._WAYBACK_BASE_URL + 'http://www.youtube.com/watch?v=%s') % (capture, video_id), + video_id=video_id, fatal=False, errnote='unable to download capture webpage (it may not be archived)', + note='Downloading capture webpage') + current_info = self._extract_metadata(video_id, webpage or '') + # Try avoid getting deleted video metadata + if current_info.get('title'): + info = merge_dicts(info, current_info) + if 'captures' not in self._configuration_arg('check_all'): + break + + info['thumbnails'] = self._extract_thumbnails(video_id) + + if urlh: + url = compat_urllib_parse_unquote(urlh.geturl()) + video_file_url_qs = parse_qs(url) + # Attempt to recover any ext & format info from playback url & response headers + format = {'url': url, 'filesize': int_or_none(urlh.headers.get('x-archive-orig-content-length'))} + itag = try_get(video_file_url_qs, lambda x: x['itag'][0]) + if itag and itag in YoutubeIE._formats: + format.update(YoutubeIE._formats[itag]) + format.update({'format_id': itag}) + else: + mime = try_get(video_file_url_qs, lambda x: x['mime'][0]) + ext = (mimetype2ext(mime) + or urlhandle_detect_ext(urlh) + or mimetype2ext(urlh.headers.get('x-archive-guessed-content-type'))) + format.update({'ext': ext}) + info['formats'] = [format] + if not info.get('duration'): + info['duration'] = str_to_int(try_get(video_file_url_qs, lambda x: x['dur'][0])) + + if not info.get('title'): + info['title'] = video_id + return info diff --git a/hypervideo_dl/extractor/arcpublishing.py b/hypervideo_dl/extractor/arcpublishing.py index 5a9b818..8880e5c 100644 --- a/hypervideo_dl/extractor/arcpublishing.py +++ b/hypervideo_dl/extractor/arcpublishing.py @@ -124,8 +124,7 @@ class ArcPublishingIE(InfoExtractor): formats.extend(smil_formats) elif stream_type in ('ts', 'hls'): m3u8_formats = self._extract_m3u8_formats( - s_url, uuid, 'mp4', 'm3u8' if is_live else 'm3u8_native', - m3u8_id='hls', fatal=False) + s_url, uuid, 'mp4', live=is_live, m3u8_id='hls', fatal=False) if all([f.get('acodec') == 'none' for f in m3u8_formats]): continue for f in m3u8_formats: @@ -158,7 +157,7 @@ class ArcPublishingIE(InfoExtractor): return { 'id': uuid, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'thumbnail': try_get(video, lambda x: x['promo_image']['url']), 'description': try_get(video, lambda x: x['subheadlines']['basic']), 'formats': formats, 
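
Stepping back to the web.archive.org extractor above: the format recovery it performs on the unquoted playback URL can be sketched standalone. A stdlib-only illustration (the helper name recover_format_hints is ours; the real code maps itag through YoutubeIE._formats and mime through mimetype2ext):

    from urllib.parse import parse_qs, urlparse

    def recover_format_hints(playback_url):
        # e.g. .../videoplayback?itag=43&mime=video%2Fwebm&dur=19.061&...
        qs = parse_qs(urlparse(playback_url).query)

        def first(key):
            return qs.get(key, [None])[0]

        mime = first('mime')  # 'video/webm' -> ext 'webm' (simplified mapping)
        return {
            'itag': first('itag'),
            'ext': mime.partition('/')[2] if mime else None,
            'duration': float(first('dur')) if first('dur') else None,
        }
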
diff --git a/hypervideo_dl/extractor/ard.py b/hypervideo_dl/extractor/ard.py index 048d30f..7ea339b 100644 --- a/hypervideo_dl/extractor/ard.py +++ b/hypervideo_dl/extractor/ard.py @@ -280,7 +280,7 @@ class ARDMediathekIE(ARDMediathekBaseIE): info.update({ 'id': video_id, - 'title': self._live_title(title) if info.get('is_live') else title, + 'title': title, 'description': description, 'thumbnail': thumbnail, }) @@ -376,9 +376,24 @@ class ARDIE(InfoExtractor): formats.append(f) self._sort_formats(formats) + _SUB_FORMATS = ( + ('./dataTimedText', 'ttml'), + ('./dataTimedTextNoOffset', 'ttml'), + ('./dataTimedTextVtt', 'vtt'), + ) + + subtitles = {} + for subsel, subext in _SUB_FORMATS: + for node in video_node.findall(subsel): + subtitles.setdefault('de', []).append({ + 'url': node.attrib['url'], + 'ext': subext, + }) + return { 'id': xpath_text(video_node, './videoId', default=display_id), 'formats': formats, + 'subtitles': subtitles, 'display_id': display_id, 'title': video_node.find('./title').text, 'duration': parse_duration(video_node.find('./duration').text), @@ -388,7 +403,14 @@ class ARDIE(InfoExtractor): class ARDBetaMediathekIE(ARDMediathekBaseIE): - _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P[^/]+)/(?Pplayer|live|video|sendung|sammlung)/(?P(?:[^/]+/)*)(?P[a-zA-Z0-9]+)' + _VALID_URL = r'''(?x)https:// + (?:(?:beta|www)\.)?ardmediathek\.de/ + (?:(?P[^/]+)/)? + (?:player|live|video|(?Psendung|sammlung))/ + (?:(?P(?(playlist)[^?#]+?|[^?#]+))/)? + (?P(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+) + (?(playlist)/(?P\d+)?/?(?:[?#]|$))''' + _TESTS = [{ 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', 'md5': 'a1dc75a39c61601b980648f7c9f9f71d', @@ -403,6 +425,25 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): 'upload_date': '20200805', 'ext': 'mp4', }, + 'skip': 'Error', + }, { + 'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll', + 'md5': 'f1837e563323b8a642a8ddeff0131f51', + 'info_dict': { + 'id': '10049223', + 'ext': 'mp4', + 'title': 'tagesschau, 20:00 Uhr', + 'timestamp': 1636398000, + 'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b', + 'upload_date': '20211108', + }, + }, { + 'url': 'https://www.ardmediathek.de/sendung/beforeigners/beforeigners/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw/1', + 'playlist_count': 6, + 'info_dict': { + 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw', + 'title': 'beforeigners/beforeigners/staffel-1', + }, }, { 'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', 'only_matching': True, @@ -426,6 +467,12 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): # playlist of type 'sammlung' 'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/', 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet', + 'only_matching': True, }] def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber): @@ -522,23 
+569,16 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): break pageNumber = pageNumber + 1 - return self.playlist_result(entries, playlist_title=display_id) + return self.playlist_result(entries, playlist_id, playlist_title=display_id) def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('video_id') - display_id = mobj.group('display_id') - if display_id: - display_id = display_id.rstrip('/') - if not display_id: - display_id = video_id - - if mobj.group('mode') in ('sendung', 'sammlung'): - # this is a playlist-URL - return self._ARD_extract_playlist( - url, video_id, display_id, - mobj.group('client'), - mobj.group('mode')) + video_id, display_id, playlist_type, client, season_number = self._match_valid_url(url).group( + 'id', 'display_id', 'playlist', 'client', 'season') + display_id, client = display_id or video_id, client or 'ard' + + if playlist_type: + # TODO: Extract only specified season + return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type) player_page = self._download_json( 'https://api.ardmediathek.de/public-gateway', @@ -574,7 +614,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): } } } -}''' % (mobj.group('client'), video_id), +}''' % (client, video_id), }).encode(), headers={ 'Content-Type': 'application/json' })['data']['playerPage'] diff --git a/hypervideo_dl/extractor/arnes.py b/hypervideo_dl/extractor/arnes.py index c0032fc..050c252 100644 --- a/hypervideo_dl/extractor/arnes.py +++ b/hypervideo_dl/extractor/arnes.py @@ -7,6 +7,7 @@ from ..compat import ( compat_urllib_parse_urlparse, ) from ..utils import ( + format_field, float_or_none, int_or_none, parse_iso8601, @@ -92,7 +93,7 @@ class ArnesIE(InfoExtractor): 'timestamp': parse_iso8601(video.get('creationTime')), 'channel': channel.get('name'), 'channel_id': channel_id, - 'channel_url': self._BASE_URL + '/?channel=' + channel_id if channel_id else None, + 'channel_url': format_field(channel_id, template=f'{self._BASE_URL}/?channel=%s'), 'duration': float_or_none(video.get('duration'), 1000), 'view_count': int_or_none(video.get('views')), 'tags': video.get('hashtags'), diff --git a/hypervideo_dl/extractor/arte.py b/hypervideo_dl/extractor/arte.py index 296b169..c2f2c1b 100644 --- a/hypervideo_dl/extractor/arte.py +++ b/hypervideo_dl/extractor/arte.py @@ -12,6 +12,7 @@ from ..utils import ( int_or_none, parse_qs, qualities, + strip_or_none, try_get, unified_strdate, url_or_none, @@ -137,6 +138,7 @@ class ArteTVIE(ArteTVBaseIE): break else: lang_pref = -1 + format_note = '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')) media_type = f.get('mediaType') if media_type == 'hls': @@ -144,14 +146,17 @@ class ArteTVIE(ArteTVBaseIE): format_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) for m3u8_format in m3u8_formats: - m3u8_format['language_preference'] = lang_pref + m3u8_format.update({ + 'language_preference': lang_pref, + 'format_note': format_note, + }) formats.extend(m3u8_formats) continue format = { 'format_id': format_id, 'language_preference': lang_pref, - 'format_note': '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')), + 'format_note': format_note, 'width': int_or_none(f.get('width')), 'height': int_or_none(f.get('height')), 'tbr': int_or_none(f.get('bitrate')), @@ -253,3 +258,44 @@ class ArteTVPlaylistIE(ArteTVBaseIE): title = collection.get('title') description = collection.get('shortDescription') or collection.get('teaserText') return self.playlist_result(entries, playlist_id, title, 
description) + + +class ArteTVCategoryIE(ArteTVBaseIE): + _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P%s)/videos/(?P[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES + _TESTS = [{ + 'url': 'https://www.arte.tv/en/videos/politics-and-society/', + 'info_dict': { + 'id': 'politics-and-society', + 'title': 'Politics and society', + 'description': 'Investigative documentary series, geopolitical analysis, and international commentary', + }, + 'playlist_mincount': 13, + }, + ] + + @classmethod + def suitable(cls, url): + return ( + not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, )) + and super(ArteTVCategoryIE, cls).suitable(url)) + + def _real_extract(self, url): + lang, playlist_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, playlist_id) + + items = [] + for video in re.finditer( + r']*?href\s*=\s*(?P"|\'|\b)(?Phttps?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang, + webpage): + video = video.group('url') + if video == url: + continue + if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )): + items.append(video) + + title = (self._og_search_title(webpage, default=None) + or self._html_search_regex(r']*>([^<]+)', default=None)) + title = strip_or_none(title.rsplit('|', 1)[0]) or self._generic_title(url) + + return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title, + description=self._og_search_description(webpage, default=None)) diff --git a/hypervideo_dl/extractor/asiancrush.py b/hypervideo_dl/extractor/asiancrush.py index 75a6329..7f1940f 100644 --- a/hypervideo_dl/extractor/asiancrush.py +++ b/hypervideo_dl/extractor/asiancrush.py @@ -181,8 +181,7 @@ class AsianCrushPlaylistIE(AsianCrushBaseIE): 'title', default=None) or self._og_search_title( webpage, default=None) or self._html_search_meta( 'twitter:title', webpage, 'title', - default=None) or self._search_regex( - r'([^<]+)', webpage, 'title', fatal=False) + default=None) or self._html_extract_title(webpage) if title: title = re.sub(r'\s*\|\s*.+?$', '', title) diff --git a/hypervideo_dl/extractor/atresplayer.py b/hypervideo_dl/extractor/atresplayer.py index 8143eb4..465af4e 100644 --- a/hypervideo_dl/extractor/atresplayer.py +++ b/hypervideo_dl/extractor/atresplayer.py @@ -24,9 +24,6 @@ class AtresPlayerIE(InfoExtractor): 'description': 'md5:7634cdcb4d50d5381bedf93efb537fbc', 'duration': 3413, }, - 'params': { - 'format': 'bestvideo', - }, 'skip': 'This video is only available for registered users' }, { @@ -40,9 +37,6 @@ class AtresPlayerIE(InfoExtractor): ] _API_BASE = 'https://api.atresplayer.com/' - def _real_initialize(self): - self._login() - def _handle_error(self, e, code): if isinstance(e.cause, compat_HTTPError) and e.cause.code == code: error = self._parse_json(e.cause.read(), None) @@ -51,11 +45,7 @@ class AtresPlayerIE(InfoExtractor): raise ExtractorError(error['error_description'], expected=True) raise - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): self._request_webpage( self._API_BASE + 'login', None, 'Downloading login page') diff --git a/hypervideo_dl/extractor/atvat.py b/hypervideo_dl/extractor/atvat.py index 7c30cfc..481a097 100644 --- a/hypervideo_dl/extractor/atvat.py +++ b/hypervideo_dl/extractor/atvat.py @@ -8,6 +8,7 @@ from ..utils import ( float_or_none, jwt_encode_hs256, try_get, + ExtractorError, ) @@ -94,6 +95,11 @@ class ATVAtIE(InfoExtractor): }) video_id, videos_data = list(videos['data'].items())[0] + 
error_msg = try_get(videos_data, lambda x: x['error']['title']) + if error_msg == 'Geo check failed': + self.raise_geo_restricted(error_msg) + elif error_msg: + raise ExtractorError(error_msg) entries = [ self._extract_video_info(url, contentResource[video['id']], video) for video in videos_data] diff --git a/hypervideo_dl/extractor/audiomack.py b/hypervideo_dl/extractor/audiomack.py index cc77713..19775cf 100644 --- a/hypervideo_dl/extractor/audiomack.py +++ b/hypervideo_dl/extractor/audiomack.py @@ -14,7 +14,7 @@ from ..utils import ( class AudiomackIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audiomack\.com/song/(?P[\w/-]+)' + _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:song/|(?=.+/song/))(?P[\w/-]+)' IE_NAME = 'audiomack' _TESTS = [ # hosted on audiomack @@ -29,6 +29,7 @@ class AudiomackIE(InfoExtractor): } }, # audiomack wrapper around soundcloud song + # Needs new test URL. { 'add_ie': ['Soundcloud'], 'url': 'http://www.audiomack.com/song/hip-hop-daily/black-mamba-freestyle', @@ -39,15 +40,16 @@ class AudiomackIE(InfoExtractor): 'title': 'Black Mamba Freestyle [Prod. By Danny Wolf]', 'uploader': 'ILOVEMAKONNEN', 'upload_date': '20160414', - } + }, + 'skip': 'Song has been removed from the site', }, ] def _real_extract(self, url): - # URLs end with [uploader name]/[uploader title] + # URLs end with [uploader name]/song/[uploader title] # this title is whatever the user types in, and is rarely # the proper song title. Real metadata is in the api response - album_url_tag = self._match_id(url) + album_url_tag = self._match_id(url).replace('/song/', '/') # Request the extended version of the api for extra fields like artist and title api_response = self._download_json( @@ -73,13 +75,13 @@ class AudiomackIE(InfoExtractor): class AudiomackAlbumIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audiomack\.com/album/(?P[\w/-]+)' + _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:album/|(?=.+/album/))(?P[\w/-]+)' IE_NAME = 'audiomack:album' _TESTS = [ # Standard album playlist { 'url': 'http://www.audiomack.com/album/flytunezcom/tha-tour-part-2-mixtape', - 'playlist_count': 15, + 'playlist_count': 11, 'info_dict': { 'id': '812251', @@ -95,24 +97,27 @@ class AudiomackAlbumIE(InfoExtractor): }, 'playlist': [{ 'info_dict': { - 'title': 'PPP (Pistol P Project) - 9. Heaven or Hell (CHIMACA) ft Zuse (prod by DJ FU)', - 'id': '837577', + 'title': 'PPP (Pistol P Project) - 8. Real (prod by SYK SENSE )', + 'id': '837576', + 'ext': 'mp3', + 'uploader': 'Lil Herb a.k.a. G Herbo', + } + }, { + 'info_dict': { + 'title': 'PPP (Pistol P Project) - 10. 4 Minutes Of Hell Part 4 (prod by DY OF 808 MAFIA)', + 'id': '837580', 'ext': 'mp3', 'uploader': 'Lil Herb a.k.a. G Herbo', } }], - 'params': { - 'playliststart': 9, - 'playlistend': 9, - } } ] def _real_extract(self, url): - # URLs end with [uploader name]/[uploader title] + # URLs end with [uploader name]/album/[uploader title] # this title is whatever the user types in, and is rarely # the proper song title. 
Real metadata is in the api response - album_url_tag = self._match_id(url) + album_url_tag = self._match_id(url).replace('/album/', '/') result = {'_type': 'playlist', 'entries': []} # There is no one endpoint for album metadata - instead it is included/repeated in each song's metadata # Therefore we don't know how many songs the album has and must infi-loop until failure @@ -134,7 +139,7 @@ class AudiomackAlbumIE(InfoExtractor): # Pull out the album metadata and add to result (if it exists) for resultkey, apikey in [('id', 'album_id'), ('title', 'album_title')]: if apikey in api_response and resultkey not in result: - result[resultkey] = api_response[apikey] + result[resultkey] = compat_str(api_response[apikey]) song_id = url_basename(api_response['url']).rpartition('.')[0] result['entries'].append({ 'id': compat_str(api_response.get('id', song_id)), diff --git a/hypervideo_dl/extractor/awaan.py b/hypervideo_dl/extractor/awaan.py index 22cc10d..f5e559c 100644 --- a/hypervideo_dl/extractor/awaan.py +++ b/hypervideo_dl/extractor/awaan.py @@ -9,6 +9,7 @@ from ..compat import ( compat_str, ) from ..utils import ( + format_field, int_or_none, parse_iso8601, smuggle_url, @@ -41,9 +42,9 @@ class AWAANBaseIE(InfoExtractor): return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'description': video_data.get('description_en') or video_data.get('description_ar'), - 'thumbnail': 'http://admin.mangomolo.com/analytics/%s' % img if img else None, + 'thumbnail': format_field(img, template='http://admin.mangomolo.com/analytics/%s'), 'duration': int_or_none(video_data.get('duration')), 'timestamp': parse_iso8601(video_data.get('create_time'), ' '), 'is_live': is_live, diff --git a/hypervideo_dl/extractor/azmedien.py b/hypervideo_dl/extractor/azmedien.py index fee640e..0168340 100644 --- a/hypervideo_dl/extractor/azmedien.py +++ b/hypervideo_dl/extractor/azmedien.py @@ -11,11 +11,12 @@ class AZMedienIE(InfoExtractor): IE_DESC = 'AZ Medien videos' _VALID_URL = r'''(?x) https?:// - (?:www\.)? + (?:www\.|tv\.)? 
(?P telezueri\.ch| telebaern\.tv| - telem1\.ch + telem1\.ch| + tvo-online\.ch )/ [^/]+/ (?P @@ -30,7 +31,7 @@ class AZMedienIE(InfoExtractor): ''' _TESTS = [{ - 'url': 'https://www.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569', + 'url': 'https://tv.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569', 'info_dict': { 'id': '1_anruz3wy', 'ext': 'mp4', @@ -38,6 +39,9 @@ class AZMedienIE(InfoExtractor): 'uploader_id': 'TVOnline', 'upload_date': '20180930', 'timestamp': 1538328802, + 'view_count': int, + 'thumbnail': 'http://cfvod.kaltura.com/p/1719221/sp/171922100/thumbnail/entry_id/1_anruz3wy/version/100031', + 'duration': 1930 }, 'params': { 'skip_download': True, diff --git a/hypervideo_dl/extractor/banbye.py b/hypervideo_dl/extractor/banbye.py new file mode 100644 index 0000000..3d4d36e --- /dev/null +++ b/hypervideo_dl/extractor/banbye.py @@ -0,0 +1,153 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import math + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlparse, + compat_parse_qs, +) +from ..utils import ( + format_field, + InAdvancePagedList, + traverse_obj, + unified_timestamp, +) + + +class BanByeBaseIE(InfoExtractor): + _API_BASE = 'https://api.banbye.com' + _CDN_BASE = 'https://cdn.banbye.com' + _VIDEO_BASE = 'https://banbye.com/watch' + + @staticmethod + def _extract_playlist_id(url, param='playlist'): + return compat_parse_qs( + compat_urllib_parse_urlparse(url).query).get(param, [None])[0] + + def _extract_playlist(self, playlist_id): + data = self._download_json(f'{self._API_BASE}/playlists/{playlist_id}', playlist_id) + return self.playlist_result([ + self.url_result(f'{self._VIDEO_BASE}/{video_id}', BanByeIE) + for video_id in data['videoIds']], playlist_id, data.get('name')) + + +class BanByeIE(BanByeBaseIE): + _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?watch/(?P\w+)' + _TESTS = [{ + 'url': 'https://banbye.com/watch/v_ytfmvkVYLE8T', + 'md5': '2f4ea15c5ca259a73d909b2cfd558eb5', + 'info_dict': { + 'id': 'v_ytfmvkVYLE8T', + 'ext': 'mp4', + 'title': 'md5:5ec098f88a0d796f987648de6322ba0f', + 'description': 'md5:4d94836e73396bc18ef1fa0f43e5a63a', + 'uploader': 'wRealu24', + 'channel_id': 'ch_wrealu24', + 'channel_url': 'https://banbye.com/channel/ch_wrealu24', + 'timestamp': 1647604800, + 'upload_date': '20220318', + 'duration': 1931, + 'thumbnail': r're:https?://.*\.webp', + 'tags': 'count:5', + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + 'comment_count': int, + }, + }, { + 'url': 'https://banbye.com/watch/v_2JjQtqjKUE_F?playlistId=p_Ld82N6gBw_OJ', + 'info_dict': { + 'title': 'Krzysztof Karoń', + 'id': 'p_Ld82N6gBw_OJ', + }, + 'playlist_count': 9, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + playlist_id = self._extract_playlist_id(url, 'playlistId') + + if self._yes_playlist(playlist_id, video_id): + return self._extract_playlist(playlist_id) + + data = self._download_json(f'{self._API_BASE}/videos/{video_id}', video_id) + thumbnails = [{ + 'id': f'{quality}p', + 'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.webp', + } for quality in [48, 96, 144, 240, 512, 1080]] + formats = [{ + 'format_id': f'http-{quality}p', + 'quality': quality, + 'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.mp4', + } for quality in data['quality']] + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': data.get('title'), + 'description': data.get('desc'), + 'uploader': traverse_obj(data, ('channel', 'name')), + 
'channel_id': data.get('channelId'), + 'channel_url': format_field(data, 'channelId', 'https://banbye.com/channel/%s'), + 'timestamp': unified_timestamp(data.get('publishedAt')), + 'duration': data.get('duration'), + 'tags': data.get('tags'), + 'formats': formats, + 'thumbnails': thumbnails, + 'like_count': data.get('likes'), + 'dislike_count': data.get('dislikes'), + 'view_count': data.get('views'), + 'comment_count': data.get('commentCount'), + } + + +class BanByeChannelIE(BanByeBaseIE): + _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?channel/(?P\w+)' + _TESTS = [{ + 'url': 'https://banbye.com/channel/ch_wrealu24', + 'info_dict': { + 'title': 'wRealu24', + 'id': 'ch_wrealu24', + 'description': 'md5:da54e48416b74dfdde20a04867c0c2f6', + }, + 'playlist_mincount': 791, + }, { + 'url': 'https://banbye.com/channel/ch_wrealu24?playlist=p_Ld82N6gBw_OJ', + 'info_dict': { + 'title': 'Krzysztof Karoń', + 'id': 'p_Ld82N6gBw_OJ', + }, + 'playlist_count': 9, + }] + _PAGE_SIZE = 100 + + def _real_extract(self, url): + channel_id = self._match_id(url) + playlist_id = self._extract_playlist_id(url) + + if playlist_id: + return self._extract_playlist(playlist_id) + + def page_func(page_num): + data = self._download_json(f'{self._API_BASE}/videos', channel_id, query={ + 'channelId': channel_id, + 'sort': 'new', + 'limit': self._PAGE_SIZE, + 'offset': page_num * self._PAGE_SIZE, + }, note=f'Downloading page {page_num+1}') + return [ + self.url_result(f"{self._VIDEO_BASE}/{video['_id']}", BanByeIE) + for video in data['items'] + ] + + channel_data = self._download_json(f'{self._API_BASE}/channels/{channel_id}', channel_id) + entries = InAdvancePagedList( + page_func, + math.ceil(channel_data['videoCount'] / self._PAGE_SIZE), + self._PAGE_SIZE) + + return self.playlist_result( + entries, channel_id, channel_data.get('name'), channel_data.get('description')) diff --git a/hypervideo_dl/extractor/bandaichannel.py b/hypervideo_dl/extractor/bandaichannel.py index d672859..f1bcdef 100644 --- a/hypervideo_dl/extractor/bandaichannel.py +++ b/hypervideo_dl/extractor/bandaichannel.py @@ -21,7 +21,6 @@ class BandaiChannelIE(BrightcoveNewIE): 'duration': 1387.733, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }] diff --git a/hypervideo_dl/extractor/bandcamp.py b/hypervideo_dl/extractor/bandcamp.py index b664145..745055e 100644 --- a/hypervideo_dl/extractor/bandcamp.py +++ b/hypervideo_dl/extractor/bandcamp.py @@ -183,6 +183,7 @@ class BandcampIE(InfoExtractor): 'format_note': f.get('description'), 'filesize': parse_filesize(f.get('size_mb')), 'vcodec': 'none', + 'acodec': format_id.split('-')[0], }) self._sort_formats(formats) @@ -212,7 +213,7 @@ class BandcampIE(InfoExtractor): class BandcampAlbumIE(BandcampIE): IE_NAME = 'Bandcamp:album' - _VALID_URL = r'https?://(?:(?P[^.]+)\.)?bandcamp\.com(?!/music)(?:/album/(?P[^/?#&]+))?' 
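
The replacement pattern on the next line requires an explicit /album/ path, so bare artist pages no longer match here; they are handled by the new Bandcamp:user extractor further down (note the dotscale test moving there). A minimal sanity sketch, with the named groups assumed to be subdomain and id as in the neighbouring Bandcamp patterns:

    import re

    ALBUM_RE = re.compile(
        r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com/album/(?P<id>[^/?#&]+)')

    assert ALBUM_RE.match('http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1')
    # Bare profiles now fall through to Bandcamp:user instead of matching here.
    assert not ALBUM_RE.match('http://dotscale.bandcamp.com')
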
+ _VALID_URL = r'https?://(?:(?P[^.]+)\.)?bandcamp\.com/album/(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', @@ -257,14 +258,6 @@ class BandcampAlbumIE(BandcampIE): 'id': 'hierophany-of-the-open-grave', }, 'playlist_mincount': 9, - }, { - 'url': 'http://dotscale.bandcamp.com', - 'info_dict': { - 'title': 'Loom', - 'id': 'dotscale', - 'uploader_id': 'dotscale', - }, - 'playlist_mincount': 7, }, { # with escaped quote in title 'url': 'https://jstrecords.bandcamp.com/album/entropy-ep', @@ -391,41 +384,63 @@ class BandcampWeeklyIE(BandcampIE): } -class BandcampMusicIE(InfoExtractor): - _VALID_URL = r'https?://(?P[^/]+)\.bandcamp\.com/music' +class BandcampUserIE(InfoExtractor): + IE_NAME = 'Bandcamp:user' + _VALID_URL = r'https?://(?!www\.)(?P[^.]+)\.bandcamp\.com(?:/music)?/?(?:[#?]|$)' + _TESTS = [{ + # Type 1 Bandcamp user page. + 'url': 'https://adrianvonziegler.bandcamp.com', + 'info_dict': { + 'id': 'adrianvonziegler', + 'title': 'Discography of adrianvonziegler', + }, + 'playlist_mincount': 23, + }, { + # Bandcamp user page with only one album + 'url': 'http://dotscale.bandcamp.com', + 'info_dict': { + 'id': 'dotscale', + 'title': 'Discography of dotscale' + }, + 'playlist_count': 1, + }, { + # Type 2 Bandcamp user page. + 'url': 'https://nightcallofficial.bandcamp.com', + 'info_dict': { + 'id': 'nightcallofficial', + 'title': 'Discography of nightcallofficial', + }, + 'playlist_count': 4, + }, { 'url': 'https://steviasphere.bandcamp.com/music', 'playlist_mincount': 47, 'info_dict': { 'id': 'steviasphere', + 'title': 'Discography of steviasphere', }, }, { 'url': 'https://coldworldofficial.bandcamp.com/music', 'playlist_mincount': 10, 'info_dict': { 'id': 'coldworldofficial', + 'title': 'Discography of coldworldofficial', }, }, { 'url': 'https://nuclearwarnowproductions.bandcamp.com/music', 'playlist_mincount': 399, 'info_dict': { 'id': 'nuclearwarnowproductions', + 'title': 'Discography of nuclearwarnowproductions', }, - } - ] - - _TYPE_IE_DICT = { - 'album': BandcampAlbumIE.ie_key(), - 'track': BandcampIE.ie_key() - } + }] def _real_extract(self, url): - id = self._match_id(url) - webpage = self._download_webpage(url, id) - items = re.findall(r'href\=\"\/(?P(?Palbum|track)+/[^\"]+)', webpage) - entries = [ - self.url_result( - f'https://{id}.bandcamp.com/{item[0]}', - ie=self._TYPE_IE_DICT[item[1]]) - for item in items] - return self.playlist_result(entries, id) + uploader = self._match_id(url) + webpage = self._download_webpage(url, uploader) + + discography_data = (re.findall(r'
  • ]+>\s*]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage)) + + return self.playlist_from_matches( + discography_data, uploader, f'Discography of {uploader}', getter=lambda x: urljoin(url, x)) diff --git a/hypervideo_dl/extractor/bbc.py b/hypervideo_dl/extractor/bbc.py index 4e2dcd7..29ad7de 100644 --- a/hypervideo_dl/extractor/bbc.py +++ b/hypervideo_dl/extractor/bbc.py @@ -11,6 +11,7 @@ from ..compat import ( compat_etree_Element, compat_HTTPError, compat_str, + compat_urllib_error, compat_urlparse, ) from ..utils import ( @@ -38,7 +39,7 @@ from ..utils import ( class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})' + _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})' _VALID_URL = r'''(?x) https?:// (?:www\.)?bbc\.co\.uk/ @@ -263,11 +264,7 @@ class BBCCoUkIE(InfoExtractor): 'only_matching': True, }] - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading signin page') @@ -293,9 +290,6 @@ class BBCCoUkIE(InfoExtractor): 'Unable to login: %s' % error, expected=True) raise ExtractorError('Unable to log in') - def _real_initialize(self): - self._login() - class MediaSelectionError(Exception): def __init__(self, id): self.id = id @@ -394,9 +388,17 @@ class BBCCoUkIE(InfoExtractor): formats.extend(self._extract_mpd_formats( href, programme_id, mpd_id=format_id, fatal=False)) elif transfer_format == 'hls': - formats.extend(self._extract_m3u8_formats( - href, programme_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False)) + # TODO: let expected_status be passed into _extract_xxx_formats() instead + try: + fmts = self._extract_m3u8_formats( + href, programme_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False) + except ExtractorError as e: + if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError) + and e.exc_info[1].code in (403, 404)): + raise + fmts = [] + formats.extend(fmts) elif transfer_format == 'hds': formats.extend(self._extract_f4m_formats( href, programme_id, f4m_id=format_id, fatal=False)) @@ -451,9 +453,10 @@ class BBCCoUkIE(InfoExtractor): playlist = self._download_json( 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id, playlist_id, 'Downloading playlist JSON') + formats = [] + subtitles = {} - version = playlist.get('defaultAvailableVersion') - if version: + for version in playlist.get('allAvailableVersions', []): smp_config = version['smpConfig'] title = smp_config['title'] description = smp_config['summary'] @@ -463,8 +466,17 @@ class BBCCoUkIE(InfoExtractor): continue programme_id = item.get('vpid') duration = int_or_none(item.get('duration')) - formats, subtitles = self._download_media_selector(programme_id) - return programme_id, title, description, duration, formats, subtitles + version_formats, version_subtitles = self._download_media_selector(programme_id) + types = version['types'] + for f in version_formats: + f['format_note'] = ', '.join(types) + if any('AudioDescribed' in x for x in types): + f['language_preference'] = -10 + formats += version_formats + for tag, subformats in (version_subtitles or {}).items(): + subtitles.setdefault(tag, []).extend(subformats) + + return programme_id, title, description, duration, formats, subtitles except ExtractorError as ee: if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404): raise @@ -774,21 +786,33 
@@ class BBCIE(BBCCoUkIE): 'timestamp': 1437785037, 'upload_date': '20150725', }, + }, { + # video with window.__INITIAL_DATA__ and value as JSON string + 'url': 'https://www.bbc.com/news/av/world-europe-59468682', + 'info_dict': { + 'id': 'p0b71qth', + 'ext': 'mp4', + 'title': 'Why France is making this woman a national hero', + 'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4', + 'thumbnail': r're:https?://.+/.+\.jpg', + 'timestamp': 1638230731, + 'upload_date': '20211130', + }, }, { # single video article embedded with data-media-vpid 'url': 'http://www.bbc.co.uk/sport/rowing/35908187', 'only_matching': True, }, { + # bbcthreeConfig 'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1', 'info_dict': { 'id': 'p06556y7', 'ext': 'mp4', - 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', - 'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd', + 'title': 'Things Not To Say to people that live on council estates', + 'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.", + 'duration': 360, + 'thumbnail': r're:https?://.+/.+\.jpg', }, - 'params': { - 'skip_download': True, - } }, { # window.__PRELOADED_STATE__ 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl', @@ -882,9 +906,8 @@ class BBCIE(BBCCoUkIE): playlist_title = json_ld_info.get('title') if not playlist_title: - playlist_title = self._og_search_title( - webpage, default=None) or self._html_search_regex( - r'(.+?)', webpage, 'playlist title', default=None) + playlist_title = (self._og_search_title(webpage, default=None) + or self._html_extract_title(webpage, 'playlist title', default=None)) if playlist_title: playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip() @@ -1161,9 +1184,16 @@ class BBCIE(BBCCoUkIE): return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) - initial_data = self._parse_json(self._search_regex( - r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage, - 'preload state', default='{}'), playlist_id, fatal=False) + initial_data = self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage, + 'quoted preload state', default=None) + if initial_data is None: + initial_data = self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage, + 'preload state', default={}) + else: + initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False) + initial_data = self._parse_json(initial_data, playlist_id, fatal=False) if initial_data: def parse_media(media): if not media: @@ -1204,7 +1234,10 @@ class BBCIE(BBCCoUkIE): if name == 'media-experience': parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) elif name == 'article': - for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []): + for block in (try_get(resp, + (lambda x: x['data']['blocks'], + lambda x: x['data']['content']['model']['blocks'],), + list) or []): if block.get('type') != 'media': continue parse_media(block.get('model')) diff --git a/hypervideo_dl/extractor/beeg.py b/hypervideo_dl/extractor/beeg.py index 8fbabe7..717fff3 100644 --- a/hypervideo_dl/extractor/beeg.py +++ b/hypervideo_dl/extractor/beeg.py @@ -1,32 +1,45 @@ from __future__ import unicode_literals from 
.common import InfoExtractor -from ..compat import ( - compat_str, -) + from ..utils import ( int_or_none, - parse_qs, + traverse_obj, + try_get, unified_timestamp, ) class BeegIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?beeg\.(?:com|porn(?:/video)?)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?beeg\.(?:com(?:/video)?)/-?(?P\d+)' _TESTS = [{ - # api/v6 v1 - 'url': 'http://beeg.com/5416503', - 'md5': 'a1a1b1a8bc70a89e49ccfd113aed0820', + 'url': 'https://beeg.com/-0983946056129650', + 'md5': '51d235147c4627cfce884f844293ff88', 'info_dict': { - 'id': '5416503', + 'id': '0983946056129650', 'ext': 'mp4', - 'title': 'Sultry Striptease', - 'description': 'md5:d22219c09da287c14bed3d6c37ce4bc2', - 'timestamp': 1391813355, - 'upload_date': '20140207', - 'duration': 383, + 'title': 'sucked cock and fucked in a private plane', + 'duration': 927, 'tags': list, 'age_limit': 18, + 'upload_date': '20220131', + 'timestamp': 1643656455, + 'display_id': 2540839, + } + }, { + 'url': 'https://beeg.com/-0599050563103750?t=4-861', + 'md5': 'bd8b5ea75134f7f07fad63008db2060e', + 'info_dict': { + 'id': '0599050563103750', + 'ext': 'mp4', + 'title': 'Bad Relatives', + 'duration': 2060, + 'tags': list, + 'age_limit': 18, + 'description': 'md5:b4fc879a58ae6c604f8f259155b7e3b9', + 'timestamp': 1643623200, + 'display_id': 2569965, + 'upload_date': '20220131', } }, { # api/v6 v2 @@ -36,12 +49,6 @@ class BeegIE(InfoExtractor): # api/v6 v2 w/o t 'url': 'https://beeg.com/1277207756', 'only_matching': True, - }, { - 'url': 'https://beeg.porn/video/5416503', - 'only_matching': True, - }, { - 'url': 'https://beeg.porn/5416503', - 'only_matching': True, }] def _real_extract(self, url): @@ -49,68 +56,38 @@ class BeegIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - beeg_version = self._search_regex( - r'beeg_version\s*=\s*([\da-zA-Z_-]+)', webpage, 'beeg version', - default='1546225636701') + video = self._download_json( + 'https://store.externulls.com/facts/file/%s' % video_id, + video_id, 'Downloading JSON for %s' % video_id) - if len(video_id) >= 10: - query = { - 'v': 2, - } - qs = parse_qs(url) - t = qs.get('t', [''])[0].split('-') - if len(t) > 1: - query.update({ - 's': t[0], - 'e': t[1], - }) - else: - query = {'v': 1} + fc_facts = video.get('fc_facts') + first_fact = {} + for fact in fc_facts: + if not first_fact or try_get(fact, lambda x: x['id'] < first_fact['id']): + first_fact = fact - for api_path in ('', 'api.'): - video = self._download_json( - 'https://%sbeeg.com/api/v6/%s/video/%s' - % (api_path, beeg_version, video_id), video_id, - fatal=api_path == 'api.', query=query) - if video: - break + resources = traverse_obj(video, ('file', 'hls_resources')) or first_fact.get('hls_resources') formats = [] - for format_id, video_url in video.items(): - if not video_url: - continue - height = self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None) - if not height: + for format_id, video_uri in resources.items(): + if not video_uri: continue - formats.append({ - 'url': self._proto_relative_url( - video_url.replace('{DATA_MARKERS}', 'data=pc_XX__%s_0' % beeg_version), 'https:'), - 'format_id': format_id, - 'height': int(height), - }) - self._sort_formats(formats) - - title = video['title'] - video_id = compat_str(video.get('id') or video_id) - display_id = video.get('code') - description = video.get('desc') - series = video.get('ps_name') + height = int_or_none(self._search_regex(r'fl_cdn_(\d+)', format_id, 'height', default=None)) + current_formats = 
self._extract_m3u8_formats(f'https://video.beeg.com/{video_uri}', video_id, ext='mp4', m3u8_id=str(height)) + for f in current_formats: + f['height'] = height + formats.extend(current_formats) - timestamp = unified_timestamp(video.get('date')) - duration = int_or_none(video.get('duration')) - - tags = [tag.strip() for tag in video['tags'].split(',')] if video.get('tags') else None + self._sort_formats(formats) return { 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'series': series, - 'timestamp': timestamp, - 'duration': duration, - 'tags': tags, + 'display_id': first_fact.get('id'), + 'title': traverse_obj(video, ('file', 'stuff', 'sf_name')), + 'description': traverse_obj(video, ('file', 'stuff', 'sf_story')), + 'timestamp': unified_timestamp(first_fact.get('fc_created')), + 'duration': int_or_none(traverse_obj(video, ('file', 'fl_duration'))), + 'tags': traverse_obj(video, ('tags', ..., 'tg_name')), 'formats': formats, 'age_limit': self._rta_search(webpage), } diff --git a/hypervideo_dl/extractor/bigo.py b/hypervideo_dl/extractor/bigo.py new file mode 100644 index 0000000..ddf76ac --- /dev/null +++ b/hypervideo_dl/extractor/bigo.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError, urlencode_postdata + + +class BigoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bigo\.tv/(?:[a-z]{2,}/)?(?P[^/]+)' + + _TESTS = [{ + 'url': 'https://www.bigo.tv/ja/221338632', + 'info_dict': { + 'id': '6576287577575737440', + 'title': '土よ〜💁‍♂️ 休憩室/REST room', + 'thumbnail': r're:https?://.+', + 'uploader': '✨Shin💫', + 'uploader_id': '221338632', + 'is_live': True, + }, + 'skip': 'livestream', + }, { + 'url': 'https://www.bigo.tv/th/Tarlerm1304', + 'only_matching': True, + }, { + 'url': 'https://bigo.tv/115976881', + 'only_matching': True, + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + + info_raw = self._download_json( + 'https://bigo.tv/studio/getInternalStudioInfo', + user_id, data=urlencode_postdata({'siteId': user_id})) + + if not isinstance(info_raw, dict): + raise ExtractorError('Received invalid JSON data') + if info_raw.get('code'): + raise ExtractorError( + 'Bigo says: %s (code %s)' % (info_raw.get('msg'), info_raw.get('code')), expected=True) + info = info_raw.get('data') or {} + + if not info.get('alive'): + raise ExtractorError('This user is offline.', expected=True) + + return { + 'id': info.get('roomId') or user_id, + 'title': info.get('roomTopic') or info.get('nick_name') or user_id, + 'formats': [{ + 'url': info.get('hls_src'), + 'ext': 'mp4', + 'protocol': 'm3u8', + }], + 'thumbnail': info.get('snapshot'), + 'uploader': info.get('nick_name'), + 'uploader_id': user_id, + 'is_live': True, + } diff --git a/hypervideo_dl/extractor/bilibili.py b/hypervideo_dl/extractor/bilibili.py index 8d66b43..909f7f8 100644 --- a/hypervideo_dl/extractor/bilibili.py +++ b/hypervideo_dl/extractor/bilibili.py @@ -1,5 +1,6 @@ # coding: utf-8 +import base64 import hashlib import itertools import functools @@ -14,19 +15,21 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + filter_dict, int_or_none, float_or_none, + mimetype2ext, parse_iso8601, traverse_obj, - try_get, + parse_count, smuggle_url, srt_subtitles_timecode, str_or_none, - str_to_int, strip_jsonp, unified_timestamp, unsmuggle_url, urlencode_postdata, + url_or_none, OnDemandPagedList ) @@ -50,16 +53,14 @@ class BiliBiliIE(InfoExtractor): 'url': 
'http://www.bilibili.com/video/av1074402/', 'md5': '5f7d29e1a2872f3df0cf76b1f87d3788', 'info_dict': { - 'id': '1074402', - 'ext': 'flv', + 'id': '1074402_part1', + 'ext': 'mp4', 'title': '【金坷垃】金泡沫', + 'uploader_id': '156160', + 'uploader': '菊子桑', + 'upload_date': '20140420', 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', - 'duration': 308.067, 'timestamp': 1398012678, - 'upload_date': '20140420', - 'thumbnail': r're:^https?://.+\.jpg', - 'uploader': '菊子桑', - 'uploader_id': '156160', }, }, { # Tested in BiliBiliBangumiIE @@ -73,49 +74,27 @@ class BiliBiliIE(InfoExtractor): 'url': 'http://bangumi.bilibili.com/anime/5802/play#100643', 'md5': '3f721ad1e75030cc06faf73587cfec57', 'info_dict': { - 'id': '100643', + 'id': '100643_part1', 'ext': 'mp4', 'title': 'CHAOS;CHILD', 'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...', }, 'skip': 'Geo-restricted to China', }, { - # Title with double quotes 'url': 'http://www.bilibili.com/video/av8903802/', 'info_dict': { - 'id': '8903802', + 'id': '8903802_part1', + 'ext': 'mp4', 'title': '阿滴英文|英文歌分享#6 "Closer', + 'upload_date': '20170301', 'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文', + 'timestamp': 1488382634, + 'uploader_id': '65880958', + 'uploader': '阿滴英文', + }, + 'params': { + 'skip_download': True, }, - 'playlist': [{ - 'info_dict': { - 'id': '8903802_part1', - 'ext': 'flv', - 'title': '阿滴英文|英文歌分享#6 "Closer', - 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a', - 'uploader': '阿滴英文', - 'uploader_id': '65880958', - 'timestamp': 1488382634, - 'upload_date': '20170301', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'info_dict': { - 'id': '8903802_part2', - 'ext': 'flv', - 'title': '阿滴英文|英文歌分享#6 "Closer', - 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a', - 'uploader': '阿滴英文', - 'uploader_id': '65880958', - 'timestamp': 1488382634, - 'upload_date': '20170301', - }, - 'params': { - 'skip_download': True, - }, - }] }, { # new BV video id format 'url': 'https://www.bilibili.com/video/BV1JE411F741', @@ -150,6 +129,7 @@ class BiliBiliIE(InfoExtractor): av_id, bv_id = self._get_video_id_set(video_id, mobj.group('id_bv') is not None) video_id = av_id + info = {} anime_id = mobj.group('anime_id') page_id = mobj.group('page') webpage = self._download_webpage(url, video_id) @@ -201,66 +181,95 @@ class BiliBiliIE(InfoExtractor): } headers.update(self.geo_verification_headers()) + video_info = self._parse_json( + self._search_regex(r'window.__playinfo__\s*=\s*({.+?})', webpage, 'video info', default=None) or '{}', + video_id, fatal=False) + video_info = video_info.get('data') or {} + + durl = traverse_obj(video_info, ('dash', 'video')) + audios = traverse_obj(video_info, ('dash', 'audio')) or [] entries = [] RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4') for num, rendition in enumerate(RENDITIONS, start=1): payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition) sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() - - video_info = self._download_json( - 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign), - video_id, note='Downloading video info page', - headers=headers, fatal=num == len(RENDITIONS)) - if not video_info: - continue + video_info = self._download_json( + 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign), + video_id, note='Downloading video info page', + headers=headers, fatal=num == len(RENDITIONS)) + if not 
video_info: + continue - if 'durl' not in video_info: + if not durl and 'durl' not in video_info: if num < len(RENDITIONS): continue self._report_error(video_info) - for idx, durl in enumerate(video_info['durl']): - formats = [{ - 'url': durl['url'], - 'filesize': int_or_none(durl['size']), - }] - for backup_url in durl.get('backup_url', []): + formats = [] + for idx, durl in enumerate(durl or video_info['durl']): + formats.append({ + 'url': durl.get('baseUrl') or durl.get('base_url') or durl.get('url'), + 'ext': mimetype2ext(durl.get('mimeType') or durl.get('mime_type')), + 'fps': int_or_none(durl.get('frameRate') or durl.get('frame_rate')), + 'width': int_or_none(durl.get('width')), + 'height': int_or_none(durl.get('height')), + 'vcodec': durl.get('codecs'), + 'acodec': 'none' if audios else None, + 'tbr': float_or_none(durl.get('bandwidth'), scale=1000), + 'filesize': int_or_none(durl.get('size')), + }) + for backup_url in traverse_obj(durl, 'backup_url', expected_type=list) or []: formats.append({ 'url': backup_url, - # backup URLs have lower priorities 'quality': -2 if 'hd.mp4' in backup_url else -3, }) - for a_format in formats: - a_format.setdefault('http_headers', {}).update({ - 'Referer': url, + for audio in audios: + formats.append({ + 'url': audio.get('baseUrl') or audio.get('base_url') or audio.get('url'), + 'ext': mimetype2ext(audio.get('mimeType') or audio.get('mime_type')), + 'fps': int_or_none(audio.get('frameRate') or audio.get('frame_rate')), + 'width': int_or_none(audio.get('width')), + 'height': int_or_none(audio.get('height')), + 'acodec': audio.get('codecs'), + 'vcodec': 'none', + 'tbr': float_or_none(audio.get('bandwidth'), scale=1000), + 'filesize': int_or_none(audio.get('size')) + }) + for backup_url in traverse_obj(audio, 'backup_url', expected_type=list) or []: + formats.append({ + 'url': backup_url, + # backup URLs have lower priorities + 'quality': -3, }) - self._sort_formats(formats) - - entries.append({ - 'id': '%s_part%s' % (video_id, idx), - 'duration': float_or_none(durl.get('length'), 1000), - 'formats': formats, - }) + info.update({ + 'id': video_id, + 'duration': float_or_none(durl.get('length'), 1000), + 'formats': formats, + 'http_headers': { + 'Referer': url, + }, + }) break - title = self._html_search_regex( - (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1', - r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title', - group='title') + self._sort_formats(formats) + + title = self._html_search_regex(( + r'<h1[^>]+title=(["\'])(?P<content>[^"\']+)', + r'(?s)<h1[^>]*>(?P<content>.+?)</h1>', + self._meta_regex('title') + ), webpage, 'title', group='content', fatal=False) # Get part title for anthologies if page_id is not None: - # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video - part_title = try_get( - self._download_json( - f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp', - video_id, note='Extracting videos in anthology'), - lambda x: x['data'][int(page_id) - 1]['part']) - title = part_title or title + # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video.
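The BiliBili hunks above lean heavily on traverse_obj in place of the old chained try_get calls. A rough sketch of the semantics being relied on (assuming the traverse_obj helper from hypervideo_dl.utils; illustrative only, not part of the patch):

    from hypervideo_dl.utils import traverse_obj

    video = {'tags': [{'tg_name': 'music'}, {'tg_name': 'remix'}, {}]}
    # Ellipsis branches collect every value present at that level
    assert traverse_obj(video, ('tags', ..., 'tg_name')) == ['music', 'remix']
    # Missing paths yield None instead of raising, like try_get did
    assert traverse_obj(video, ('file', 'stuff', 'sf_name')) is None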
+ part_info = traverse_obj(self._download_json( + f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp', + video_id, note='Extracting videos in anthology'), 'data', expected_type=list) + title = title if len(part_info) == 1 else traverse_obj(part_info, (int(page_id) - 1, 'part')) or title description = self._html_search_meta('description', webpage) timestamp = unified_timestamp(self._html_search_regex( @@ -270,15 +279,15 @@ class BiliBiliIE(InfoExtractor): thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage) # TODO 'view_count' requires deobfuscating Javascript - info = { - 'id': str(video_id) if page_id is None else '%s_part%s' % (video_id, page_id), + info.update({ + 'id': f'{video_id}_part{page_id or 1}', 'cid': cid, 'title': title, 'description': description, 'timestamp': timestamp, 'thumbnail': thumbnail, 'duration': float_or_none(video_info.get('timelength'), scale=1000), - } + }) uploader_mobj = re.search( r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>\s*(?P<name>[^<]+?)\s*<', @@ -299,7 +308,7 @@ class BiliBiliIE(InfoExtractor): video_id, fatal=False, note='Downloading tags'), ('data', ..., 'tag_name')), } - entries[0]['subtitles'] = { + info['subtitles'] = { 'danmaku': [{ 'ext': 'xml', 'url': f'https://comment.bilibili.com/{cid}.xml', @@ -334,19 +343,18 @@ class BiliBiliIE(InfoExtractor): entry['id'] = '%s_part%d' % (video_id, (idx + 1)) return { - '_type': 'multi_video', 'id': str(video_id), 'bv_id': bv_id, 'title': title, 'description': description, - 'entries': entries, + **info, **top_level_info } def _extract_anthology_entries(self, bv_id, video_id, webpage): title = self._html_search_regex( (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1', - r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title', + r'(?s)<h1[^>]*>(?P<title>.+?)</h1>', + r'<title>(?P<title>.+?)</title>'), webpage, 'title', group='title') json_data = self._download_json( f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp', @@ -376,8 +384,10 @@ class BiliBiliIE(InfoExtractor): replies = traverse_obj( self._download_json( f'https://api.bilibili.com/x/v2/reply?pn={idx}&oid={video_id}&type=1&jsonp=jsonp&sort=2&_=1567227301685', - video_id, note=f'Extracting comments from page {idx}'), - ('data', 'replies')) or [] + video_id, note=f'Extracting comments from page {idx}', fatal=False), - ('data', 'replies')) + ('data', 'replies')) + if not replies: + return for children in map(self._get_all_children, replies): yield from children @@ -477,9 +487,9 @@ class BilibiliChannelIE(InfoExtractor): data = self._download_json( self._API_URL % (list_id, page_num), list_id, note=f'Downloading page {page_num}')['data'] - max_count = max_count or try_get(data, lambda x: x['page']['count']) + max_count = max_count or traverse_obj(data, ('page', 'count')) - entries = try_get(data, lambda x: x['list']['vlist']) + entries = traverse_obj(data, ('list', 'vlist')) if not entries: return for entry in entries: @@ -517,7 +527,7 @@ class BilibiliCategoryIE(InfoExtractor): api_url, query, query={'Search_key': query, 'pn': page_num}, note='Extracting results from page %s of %s' % (page_num, num_pages)) - video_list = try_get(parsed_json, lambda x: x['data']['archives'], list) + video_list = traverse_obj(parsed_json, ('data', 'archives'), expected_type=list) if not video_list: raise ExtractorError('Failed to retrieve video list for page %d' % page_num) @@ -547,7 +557,7 @@ class BilibiliCategoryIE(InfoExtractor): api_url =
'https://api.bilibili.com/x/web-interface/newlist?rid=%d&type=1&ps=20&jsonp=jsonp' % rid_value page_json = self._download_json(api_url, query, query={'Search_key': query, 'pn': '1'}) - page_data = try_get(page_json, lambda x: x['data']['page'], dict) + page_data = traverse_obj(page_json, ('data', 'page'), expected_type=dict) count, size = int_or_none(page_data.get('count')), int_or_none(page_data.get('size')) if count is None or not size: raise ExtractorError('Failed to calculate either page count or size') @@ -566,7 +576,7 @@ class BilibiliCategoryIE(InfoExtractor): class BiliBiliSearchIE(SearchInfoExtractor): - IE_DESC = 'Bilibili video search, "bilisearch" keyword' + IE_DESC = 'Bilibili video search' _MAX_RESULTS = 100000 _SEARCH_KEY = 'bilisearch' @@ -719,40 +729,68 @@ class BiliBiliPlayerIE(InfoExtractor): class BiliIntlBaseIE(InfoExtractor): - _API_URL = 'https://api.bili{}/intl/gateway{}' - - def _call_api(self, type, endpoint, id): - return self._download_json(self._API_URL.format(type, endpoint), id)['data'] + _API_URL = 'https://api.bilibili.tv/intl/gateway' + _NETRC_MACHINE = 'biliintl' + + def _call_api(self, endpoint, *args, **kwargs): + json = self._download_json(self._API_URL + endpoint, *args, **kwargs) + if json.get('code'): + if json['code'] in (10004004, 10004005, 10023006): + self.raise_login_required() + elif json['code'] == 10004001: + self.raise_geo_restricted() + else: + if json.get('message') and str(json['code']) != json['message']: + errmsg = f'{kwargs.get("errnote", "Unable to download JSON metadata")}: {self.IE_NAME} said: {json["message"]}' + else: + errmsg = kwargs.get('errnote', 'Unable to download JSON metadata') + if kwargs.get('fatal'): + raise ExtractorError(errmsg) + else: + self.report_warning(errmsg) + return json.get('data') def json2srt(self, json): data = '\n\n'.join( f'{i + 1}\n{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n{line["content"]}' - for i, line in enumerate(json['body'])) + for i, line in enumerate(json['body']) if line.get('content')) return data - def _get_subtitles(self, type, ep_id): - sub_json = self._call_api(type, f'/m/subtitle?ep_id={ep_id}&platform=web', ep_id) + def _get_subtitles(self, *, ep_id=None, aid=None): + sub_json = self._call_api( + '/web/v2/subtitle', ep_id or aid, note='Downloading subtitles list', + errnote='Unable to download subtitles list', query=filter_dict({ + 'platform': 'web', + 'episode_id': ep_id, + 'aid': aid, + })) subtitles = {} - for sub in sub_json.get('subtitles', []): + for sub in sub_json.get('subtitles') or []: sub_url = sub.get('url') if not sub_url: continue - sub_data = self._download_json(sub_url, ep_id, fatal=False) + sub_data = self._download_json( + sub_url, ep_id or aid, errnote='Unable to download subtitles', fatal=False, + note='Downloading subtitles%s' % f' for {sub["lang"]}' if sub.get('lang') else '') if not sub_data: continue - subtitles.setdefault(sub.get('key', 'en'), []).append({ + subtitles.setdefault(sub.get('lang_key', 'en'), []).append({ 'ext': 'srt', 'data': self.json2srt(sub_data) }) return subtitles - def _get_formats(self, type, ep_id): - video_json = self._call_api(type, f'/web/playurl?ep_id={ep_id}&platform=web', ep_id) - if not video_json: - self.raise_login_required(method='cookies') + def _get_formats(self, *, ep_id=None, aid=None): + video_json = self._call_api( + '/web/playurl', ep_id or aid, note='Downloading video formats', + errnote='Unable to download video formats', query=filter_dict({ + 'platform': 'web', + 'ep_id': ep_id,
'aid': aid, + })) video_json = video_json['playurl'] formats = [] - for vid in video_json.get('video', []): + for vid in video_json.get('video') or []: video_res = vid.get('video_resource') or {} video_info = vid.get('stream_info') or {} if not video_res.get('url'): @@ -768,7 +806,7 @@ class BiliIntlBaseIE(InfoExtractor): 'vcodec': video_res.get('codecs'), 'filesize': video_res.get('size'), }) - for aud in video_json.get('audio_resource', []): + for aud in video_json.get('audio_resource') or []: if not aud.get('url'): continue formats.append({ @@ -783,85 +821,148 @@ class BiliIntlBaseIE(InfoExtractor): self._sort_formats(formats) return formats - def _extract_ep_info(self, type, episode_data, ep_id): + def _extract_video_info(self, video_data, *, ep_id=None, aid=None): return { - 'id': ep_id, - 'title': episode_data.get('long_title') or episode_data['title'], - 'thumbnail': episode_data.get('cover'), - 'episode_number': str_to_int(episode_data.get('title')), - 'formats': self._get_formats(type, ep_id), - 'subtitles': self._get_subtitles(type, ep_id), + 'id': ep_id or aid, + 'title': video_data.get('title_display') or video_data.get('title'), + 'thumbnail': video_data.get('cover'), + 'episode_number': int_or_none(self._search_regex( + r'^E(\d+)(?:$| - )', video_data.get('title_display') or '', 'episode number', default=None)), + 'formats': self._get_formats(ep_id=ep_id, aid=aid), + 'subtitles': self._get_subtitles(ep_id=ep_id, aid=aid), 'extractor_key': BiliIntlIE.ie_key(), } + def _perform_login(self, username, password): + try: + from Cryptodome.PublicKey import RSA + from Cryptodome.Cipher import PKCS1_v1_5 + except ImportError: + try: + from Crypto.PublicKey import RSA + from Crypto.Cipher import PKCS1_v1_5 + except ImportError: + raise ExtractorError('pycryptodomex not found. 
Please install', expected=True) + + key_data = self._download_json( + 'https://passport.bilibili.tv/x/intl/passport-login/web/key?lang=en-US', None, + note='Downloading login key', errnote='Unable to download login key')['data'] + + public_key = RSA.importKey(key_data['key']) + password_hash = PKCS1_v1_5.new(public_key).encrypt((key_data['hash'] + password).encode('utf-8')) + login_post = self._download_json( + 'https://passport.bilibili.tv/x/intl/passport-login/web/login/password?lang=en-US', None, data=urlencode_postdata({ + 'username': username, + 'password': base64.b64encode(password_hash).decode('ascii'), + 'keep_me': 'true', + 's_locale': 'en_US', + 'isTrusted': 'true' + }), note='Logging in', errnote='Unable to log in') + if login_post.get('code'): + if login_post.get('message'): + raise ExtractorError(f'Unable to log in: {self.IE_NAME} said: {login_post["message"]}', expected=True) + else: + raise ExtractorError('Unable to log in') + + class BiliIntlIE(BiliIntlBaseIE): - _VALID_URL = r'https?://(?:www\.)?bili(?P<type>bili\.tv|intl.com)/(?:[a-z]{2}/)?play/(?P<season_id>\d+)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?(play/(?P<season_id>\d+)/(?P<ep_id>\d+)|video/(?P<aid>\d+))' _TESTS = [{ + # Bstation page 'url': 'https://www.bilibili.tv/en/play/34613/341736', 'info_dict': { 'id': '341736', 'ext': 'mp4', - 'title': 'The First Night', - 'thumbnail': 'https://i0.hdslb.com/bfs/intl/management/91e30e5521235d9b163339a26a0b030ebda54310.png', + 'title': 'E2 - The First Night', + 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$', 'episode_number': 2, - }, - 'params': { - 'format': 'bv', - }, + } }, { - 'url': 'https://www.biliintl.com/en/play/34613/341736', + # Non-Bstation page + 'url': 'https://www.bilibili.tv/en/play/1033760/11005006', 'info_dict': { - 'id': '341736', + 'id': '11005006', 'ext': 'mp4', - 'title': 'The First Night', - 'thumbnail': 'https://i0.hdslb.com/bfs/intl/management/91e30e5521235d9b163339a26a0b030ebda54310.png', - 'episode_number': 2, - }, - 'params': { - 'format': 'bv', + 'title': 'E3 - Who?', + 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$', + 'episode_number': 3, + } + }, { + # Subtitle with empty content + 'url': 'https://www.bilibili.tv/en/play/1005144/10131790', + 'info_dict': { + 'id': '10131790', + 'ext': 'mp4', + 'title': 'E140 - Two Heartbeats: Kabuto\'s Trap', + 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$', + 'episode_number': 140, }, + 'skip': 'According to the copyright owner\'s request, you may only watch the video after you log in.'
+ }, { + 'url': 'https://www.biliintl.com/en/play/34613/341736', + 'only_matching': True, + }, { + # User-generated content (as opposed to a series licensed from a studio) + 'url': 'https://bilibili.tv/en/video/2019955076', + 'only_matching': True, + }, { + # No language in URL + 'url': 'https://www.bilibili.tv/video/2019955076', + 'only_matching': True, }] def _real_extract(self, url): - type, season_id, id = self._match_valid_url(url).groups() - data_json = self._call_api(type, f'/web/view/ogv_collection?season_id={season_id}', id) - episode_data = next( - episode for episode in data_json.get('episodes', []) - if str(episode.get('ep_id')) == id) - return self._extract_ep_info(type, episode_data, id) + season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid') + video_id = ep_id or aid + webpage = self._download_webpage(url, video_id) + # Bstation layout + initial_data = self._parse_json(self._search_regex( + r'window\.__INITIAL_(?:DATA|STATE)__\s*=\s*({.+?});', webpage, + 'preload state', default='{}'), video_id, fatal=False) or {} + video_data = ( + traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict) + or traverse_obj(initial_data, ('UgcVideo', 'videoData'), expected_type=dict) or {}) + + if season_id and not video_data: + # Non-Bstation layout, read through episode list + season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id) + video_data = traverse_obj(season_json, + ('sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == ep_id), + expected_type=dict, get_all=False) + return self._extract_video_info(video_data, ep_id=ep_id, aid=aid) class BiliIntlSeriesIE(BiliIntlBaseIE): - _VALID_URL = r'https?://(?:www\.)?bili(?P<type>bili\.tv|intl.com)/(?:[a-z]{2}/)?play/(?P<id>\d+)$' + _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<id>\d+)$' _TESTS = [{ 'url': 'https://www.bilibili.tv/en/play/34613', 'playlist_mincount': 15, 'info_dict': { 'id': '34613', + 'title': 'Fly Me to the Moon', + 'description': 'md5:a861ee1c4dc0acfad85f557cc42ac627', + 'categories': ['Romance', 'Comedy', 'Slice of life'], + 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$', + 'view_count': int, }, 'params': { 'skip_download': True, - 'format': 'bv', }, }, { 'url': 'https://www.biliintl.com/en/play/34613', - 'playlist_mincount': 15, - 'info_dict': { - 'id': '34613', - }, - 'params': { - 'skip_download': True, - 'format': 'bv', - }, + 'only_matching': True, }] - def _entries(self, id, type): - data_json = self._call_api(type, f'/web/view/ogv_collection?season_id={id}', id) - for episode in data_json.get('episodes', []): - episode_id = str(episode.get('ep_id')) - yield self._extract_ep_info(type, episode, episode_id) + def _entries(self, series_id): + series_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={series_id}&platform=web', series_id) + for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict, default=[]): + episode_id = str(episode.get('episode_id')) + yield self._extract_video_info(episode, ep_id=episode_id) def _real_extract(self, url): - type, id = self._match_valid_url(url).groups() - return self.playlist_result(self._entries(id, type), playlist_id=id) + series_id = self._match_id(url) + series_info = self._call_api(f'/web/v2/ogv/play/season_info?season_id={series_id}&platform=web', series_id).get('season') or {} + return self.playlist_result( + self._entries(series_id), series_id, series_info.get('title'),
series_info.get('description'), + categories=traverse_obj(series_info, ('styles', ..., 'title'), expected_type=str_or_none), + thumbnail=url_or_none(series_info.get('horizontal_cover')), view_count=parse_count(series_info.get('view'))) diff --git a/hypervideo_dl/extractor/biqle.py b/hypervideo_dl/extractor/biqle.py index 17ebbb2..2b57bad 100644 --- a/hypervideo_dl/extractor/biqle.py +++ b/hypervideo_dl/extractor/biqle.py @@ -3,27 +3,28 @@ from __future__ import unicode_literals from .common import InfoExtractor from .vk import VKIE -from ..compat import ( - compat_b64decode, - compat_urllib_parse_unquote, +from ..compat import compat_b64decode +from ..utils import ( + int_or_none, + js_to_json, + traverse_obj, + unified_timestamp, ) -from ..utils import int_or_none class BIQLEIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P<id>-?\d+_\d+)' _TESTS = [{ - # Youtube embed - 'url': 'https://biqle.ru/watch/-115995369_456239081', - 'md5': '97af5a06ee4c29bbf9c001bdb1cf5c06', + 'url': 'https://biqle.ru/watch/-2000421746_85421746', + 'md5': 'ae6ef4f04d19ac84e4658046d02c151c', 'info_dict': { - 'id': '8v4f-avW-VI', + 'id': '-2000421746_85421746', 'ext': 'mp4', - 'title': "PASSE-PARTOUT - L'ete c'est fait pour jouer", - 'description': 'Passe-Partout', - 'uploader_id': 'mrsimpsonstef3', - 'uploader': 'Phanolito', - 'upload_date': '20120822', + 'title': 'Forsaken By Hope Studio Clip', + 'description': 'Forsaken By Hope Studio Clip — Смотреть онлайн', + 'upload_date': '19700101', + 'thumbnail': r're:https://[^/]+/impf/7vN3ACwSTgChP96OdOfzFjUCzFR6ZglDQgWsIw/KPaACiVJJxM\.jpg\?size=800x450&quality=96&keep_aspect_ratio=1&background=000000&sign=b48ea459c4d33dbcba5e26d63574b1cb&type=video_thumb', + 'timestamp': 0, }, }, { 'url': 'http://biqle.org/watch/-44781847_168547604', @@ -32,53 +33,62 @@ class BIQLEIE(InfoExtractor): 'id': '-44781847_168547604', 'ext': 'mp4', 'title': 'Ребенок в шоке от автоматической мойки', + 'description': 'Ребенок в шоке от автоматической мойки — Смотреть онлайн', 'timestamp': 1396633454, - 'uploader': 'Dmitry Kotov', 'upload_date': '20140404', - 'uploader_id': '47850140', + 'thumbnail': r're:https://[^/]+/c535507/u190034692/video/l_b84df002\.jpg', }, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - embed_url = self._proto_relative_url(self._search_regex( - r'<iframe.+?src="((?:https?:)?//daxab\.com/[^"]+)".*?></iframe>', - webpage, 'embed url')) + + title = self._html_search_meta('name', webpage, 'Title', fatal=False) + timestamp = unified_timestamp(self._html_search_meta('uploadDate', webpage, 'Upload Date', default=None)) + description = self._html_search_meta('description', webpage, 'Description', default=None) + + global_embed_url = self._search_regex( + r'<script[^<]+?window.globEmbedUrl\s*=\s*\'((?:https?:)?//(?:daxab\.com|dxb\.to|[^/]+/player)/[^\']+)\'', + webpage, 'global Embed url') + hash = self._search_regex( + r'<script id="data-embed-video[^<]+?hash: "([^"]+)"', webpage, 'Hash') + + embed_url = global_embed_url + hash + if VKIE.suitable(embed_url): return self.url_result(embed_url, VKIE.ie_key(), video_id) embed_page = self._download_webpage( - embed_url, video_id, headers={'Referer': url}) - video_ext = self._get_cookies(embed_url).get('video_ext') - if video_ext: - video_ext = compat_urllib_parse_unquote(video_ext.value) - if not video_ext: - video_ext = compat_b64decode(self._search_regex( - r'video_ext\s*:\s*[\'"]([A-Za-z0-9+/=]+)', - embed_page, 'video_ext')).decode() - video_id, sig, _, access_token = video_ext.split(':') + embed_url, video_id, 'Downloading embed webpage', headers={'Referer': url}) + + glob_params = self._parse_json(self._search_regex( + r'', + embed_page, 'Global Parameters'), video_id, transform_source=js_to_json) +
host_name = compat_b64decode(glob_params['server'][::-1]).decode() + item = self._download_json( - 'https://api.vk.com/method/video.get', video_id, - headers={'User-Agent': 'okhttp/3.4.1'}, query={ - 'access_token': access_token, - 'sig': sig, - 'v': 5.44, + f'https://{host_name}/method/video.get/{video_id}', video_id, + headers={'Referer': url}, query={ + 'token': glob_params['video']['access_token'], 'videos': video_id, + 'ckey': glob_params['c_key'], + 'credentials': glob_params['video']['credentials'], })['response']['items'][0] - title = item['title'] formats = [] for f_id, f_url in item.get('files', {}).items(): if f_id == 'external': return self.url_result(f_url) ext, height = f_id.split('_') - formats.append({ - 'format_id': height + 'p', - 'url': f_url, - 'height': int_or_none(height), - 'ext': ext, - }) + height_extra_key = traverse_obj(glob_params, ('video', 'partial', 'quality', height)) + if height_extra_key: + formats.append({ + 'format_id': f'{height}p', + 'url': f'https://{host_name}/{f_url[8:]}&videos={video_id}&extra_key={height_extra_key}', + 'height': int_or_none(height), + 'ext': ext, + }) self._sort_formats(formats) thumbnails = [] @@ -96,10 +106,9 @@ class BIQLEIE(InfoExtractor): 'title': title, 'formats': formats, 'comment_count': int_or_none(item.get('comments')), - 'description': item.get('description'), + 'description': description, 'duration': int_or_none(item.get('duration')), 'thumbnails': thumbnails, - 'timestamp': int_or_none(item.get('date')), - 'uploader': item.get('owner_id'), + 'timestamp': timestamp, 'view_count': int_or_none(item.get('views')), } diff --git a/hypervideo_dl/extractor/bitwave.py b/hypervideo_dl/extractor/bitwave.py index eb16c46..e6e093f 100644 --- a/hypervideo_dl/extractor/bitwave.py +++ b/hypervideo_dl/extractor/bitwave.py @@ -51,7 +51,7 @@ class BitwaveStreamIE(InfoExtractor): return { 'id': username, - 'title': self._live_title(channel['data']['title']), + 'title': channel['data']['title'], 'uploader': username, 'uploader_id': username, 'formats': formats, diff --git a/hypervideo_dl/extractor/blogger.py b/hypervideo_dl/extractor/blogger.py new file mode 100644 index 0000000..dba131c --- /dev/null +++ b/hypervideo_dl/extractor/blogger.py @@ -0,0 +1,54 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from ..utils import ( + mimetype2ext, + parse_duration, + parse_qs, + str_or_none, + traverse_obj, +) +from .common import InfoExtractor + + +class BloggerIE(InfoExtractor): + IE_NAME = 'blogger.com' + _VALID_URL = r'https?://(?:www\.)?blogger\.com/video\.g\?token=(?P<id>.+)' + _VALID_EMBED = r'''<iframe[^>]+src=["']((?:https?:)?//(?:www\.)?blogger\.com/video\.g\?token=[^"']+)["']''' + _TESTS = [{ + 'url': 'https://www.blogger.com/video.g?token=AD6v5dzEe9hfcARr5Hlq1WTkYy6t-fXH3BBahVhGvVHe5szdEUBEloSEDSTA8-b111089KbfWuBvTN7fnbxMtymsHhXAXwVvyzHH4Qch2cfLQdGxKQrrEuFpC1amSl_9GuLWODjPgw', + 'md5': 'f1bc19b6ea1b0fd1d81e84ca9ec467ac', + 'info_dict': { + 'id': 'BLOGGER-video-3c740e3a49197e16-796', + 'title': 'BLOGGER-video-3c740e3a49197e16-796', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*', + 'duration': 76.068, + } + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall(BloggerIE._VALID_EMBED, webpage) + + def _real_extract(self, url): + token_id = self._match_id(url) + webpage = self._download_webpage(url, token_id) + data_json = self._search_regex(r'var\s+VIDEO_CONFIG\s*=\s*(\{.*)', webpage, 'JSON data') + data = self._parse_json(data_json.encode('utf-8').decode('unicode_escape'), token_id) + streams =
data['streams'] + formats = [{ + 'ext': mimetype2ext(traverse_obj(parse_qs(stream['play_url']), ('mime', 0))), + 'url': stream['play_url'], + 'format_id': str_or_none(stream.get('format_id')), + } for stream in streams] + + return { + 'id': data.get('iframe_id', token_id), + 'title': data.get('iframe_id', token_id), + 'formats': formats, + 'thumbnail': data.get('thumbnail'), + 'duration': parse_duration(traverse_obj(parse_qs(streams[0]['play_url']), ('dur', 0))), + } diff --git a/hypervideo_dl/extractor/bongacams.py b/hypervideo_dl/extractor/bongacams.py index 9e75511..4e346e7 100644 --- a/hypervideo_dl/extractor/bongacams.py +++ b/hypervideo_dl/extractor/bongacams.py @@ -49,7 +49,7 @@ class BongaCamsIE(InfoExtractor): return { 'id': channel_id, - 'title': self._live_title(uploader or uploader_id), + 'title': uploader or uploader_id, 'uploader': uploader, 'uploader_id': uploader_id, 'like_count': like_count, diff --git a/hypervideo_dl/extractor/br.py b/hypervideo_dl/extractor/br.py index 7169ece..0155827 100644 --- a/hypervideo_dl/extractor/br.py +++ b/hypervideo_dl/extractor/br.py @@ -175,7 +175,7 @@ class BRIE(InfoExtractor): class BRMediathekIE(InfoExtractor): IE_DESC = 'Bayerischer Rundfunk Mediathek' - _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek/video/[^/?&#]*?-(?P<id>av:[0-9a-f]{24})' + _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek//?video/(?:[^/?&#]+?-)?(?P<id>av:[0-9a-f]{24})' _TESTS = [{ 'url': 'https://www.br.de/mediathek/video/gesundheit-die-sendung-vom-28112017-av:5a1e6a6e8fce6d001871cc8e', @@ -188,6 +188,9 @@ class BRMediathekIE(InfoExtractor): 'timestamp': 1511942766, 'upload_date': '20171129', } + }, { + 'url': 'https://www.br.de/mediathek//video/av:61b0db581aed360007558c12', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/hypervideo_dl/extractor/breitbart.py b/hypervideo_dl/extractor/breitbart.py new file mode 100644 index 0000000..e029aa6 --- /dev/null +++ b/hypervideo_dl/extractor/breitbart.py @@ -0,0 +1,38 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class BreitBartIE(InfoExtractor): + _VALID_URL = r'https?:\/\/(?:www\.)breitbart.com/videos/v/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://www.breitbart.com/videos/v/5cOz1yup/?pl=Ij6NDOji', + 'md5': '0aa6d1d6e183ac5ca09207fe49f17ade', + 'info_dict': { + 'id': '5cOz1yup', + 'ext': 'mp4', + 'title': 'Watch \u2013 Clyburn: Statues in Congress Have to Go Because they Are Honoring Slavery', + 'description': 'md5:bac35eb0256d1cb17f517f54c79404d5', + 'thumbnail': 'https://cdn.jwplayer.com/thumbs/5cOz1yup-1920.jpg', + 'age_limit': 0, + } + }, { + 'url': 'https://www.breitbart.com/videos/v/eaiZjVOn/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + formats = self._extract_m3u8_formats(f'https://cdn.jwplayer.com/manifests/{video_id}.m3u8', video_id, ext='mp4') + self._sort_formats(formats) + return { + 'id': video_id, + 'title': (self._og_search_title(webpage, default=None) + or self._html_extract_title(webpage, 'video title')), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'age_limit': self._rta_search(webpage), + 'formats': formats + } diff --git a/hypervideo_dl/extractor/brightcove.py b/hypervideo_dl/extractor/brightcove.py index cd1c3f0..dcd332b 100644 --- a/hypervideo_dl/extractor/brightcove.py +++ b/hypervideo_dl/extractor/brightcove.py @@ -16,6 +16,7 @@ from ..compat import ( ) from ..utils import (
clean_html, + dict_get, extract_attributes, ExtractorError, find_xpath_attr, @@ -471,32 +472,22 @@ class BrightcoveNewIE(AdobePassIE): def _parse_brightcove_metadata(self, json_data, video_id, headers={}): title = json_data['name'].strip() - num_drm_sources = 0 formats, subtitles = [], {} sources = json_data.get('sources') or [] for source in sources: container = source.get('container') ext = mimetype2ext(source.get('type')) src = source.get('src') - skip_unplayable = not self.get_param('allow_unplayable_formats') - # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object - if skip_unplayable and (container == 'WVM' or source.get('key_systems')): - num_drm_sources += 1 - continue - elif ext == 'ism' and skip_unplayable: - continue - elif ext == 'm3u8' or container == 'M2TS': + if ext == 'm3u8' or container == 'M2TS': if not src: continue - f, subs = self._extract_m3u8_formats_and_subtitles( + fmts, subs = self._extract_m3u8_formats_and_subtitles( src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - formats.extend(f) subtitles = self._merge_subtitles(subtitles, subs) elif ext == 'mpd': if not src: continue - f, subs = self._extract_mpd_formats_and_subtitles(src, video_id, 'dash', fatal=False) - formats.extend(f) + fmts, subs = self._extract_mpd_formats_and_subtitles(src, video_id, 'dash', fatal=False) subtitles = self._merge_subtitles(subtitles, subs) else: streaming_src = source.get('streaming_src') @@ -543,7 +534,13 @@ class BrightcoveNewIE(AdobePassIE): 'play_path': stream_name, 'format_id': build_format_id('rtmp'), }) - formats.append(f) + fmts = [f] + + # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object + if container == 'WVM' or source.get('key_systems') or ext == 'ism': + for f in fmts: + f['has_drm'] = True + formats.extend(fmts) if not formats: errors = json_data.get('errors') @@ -551,9 +548,6 @@ class BrightcoveNewIE(AdobePassIE): error = errors[0] self.raise_no_formats( error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) - elif (not self.get_param('allow_unplayable_formats') - and sources and num_drm_sources == len(sources)): - self.report_drm(video_id) self._sort_formats(formats) @@ -577,11 +571,19 @@ class BrightcoveNewIE(AdobePassIE): if duration is not None and duration <= 0: is_live = True + common_res = [(160, 90), (320, 180), (480, 720), (640, 360), (768, 432), (1024, 576), (1280, 720), (1366, 768), (1920, 1080)] + thumb_base_url = dict_get(json_data, ('poster', 'thumbnail')) + thumbnails = [{ + 'url': re.sub(r'\d+x\d+', f'{w}x{h}', thumb_base_url), + 'width': w, + 'height': h, + } for w, h in common_res] if thumb_base_url else None + return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'description': clean_html(json_data.get('description')), - 'thumbnail': json_data.get('thumbnail') or json_data.get('poster'), + 'thumbnails': thumbnails, 'duration': duration, 'timestamp': parse_iso8601(json_data.get('published_at')), 'uploader_id': json_data.get('account_id'), diff --git a/hypervideo_dl/extractor/cableav.py b/hypervideo_dl/extractor/cableav.py new file mode 100644 index 0000000..77efdf4 --- /dev/null +++ b/hypervideo_dl/extractor/cableav.py @@ -0,0 +1,34 @@ +# coding: utf-8 +from .common import InfoExtractor + + +class CableAVIE(InfoExtractor): + _VALID_URL = r'https://cableav\.tv/(?P<id>[a-zA-Z0-9]+)' + _TESTS = [{ + 'url': 'https://cableav.tv/lS4iR9lWjN8/', + 'md5': '7e3fe5e49d61c4233b7f5b0f69b15e18',
'info_dict': { + 'id': 'lS4iR9lWjN8', + 'ext': 'mp4', + 'title': '國產麻豆AV 叮叮映畫 DDF001 情欲小說家 - CableAV', + 'description': '國產AV 480p, 720p 国产麻豆AV 叮叮映画 DDF001 情欲小说家', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_url = self._og_search_video_url(webpage, secure=False) + + formats = self._extract_m3u8_formats(video_url, video_id, 'mp4') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'formats': formats, + } diff --git a/hypervideo_dl/extractor/callin.py b/hypervideo_dl/extractor/callin.py new file mode 100644 index 0000000..1f3b7cf --- /dev/null +++ b/hypervideo_dl/extractor/callin.py @@ -0,0 +1,114 @@ +# coding: utf-8 +from .common import InfoExtractor +from ..utils import ( + traverse_obj, + float_or_none, + int_or_none +) + + +class CallinIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?callin\.com/(episode)/(?P<id>[-a-zA-Z]+)' + _TESTS = [{ + 'url': 'https://www.callin.com/episode/the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc', + 'info_dict': { + 'id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd', + 'title': 'The Title IX Regime and the Long March Through and Beyond the Institutions', + 'ext': 'ts', + 'display_id': 'the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc', + 'thumbnail': 're:https://.+\\.png', + 'description': 'First episode', + 'uploader': 'Wesley Yang', + 'timestamp': 1639404128.65, + 'upload_date': '20211213', + 'uploader_id': 'wesyang', + 'uploader_url': 'http://wesleyyang.substack.com', + 'channel': 'Conversations in Year Zero', + 'channel_id': '436d1f82ddeb30cd2306ea9156044d8d2cfdc3f1f1552d245117a42173e78553', + 'channel_url': 'https://callin.com/show/conversations-in-year-zero-oJNllRFSfx', + 'duration': 9951.936, + 'view_count': int, + 'categories': ['News & Politics', 'History', 'Technology'], + 'cast': ['Wesley Yang', 'KC Johnson', 'Gabi Abramovich'], + 'series': 'Conversations in Year Zero', + 'series_id': '436d1f82ddeb30cd2306ea9156044d8d2cfdc3f1f1552d245117a42173e78553', + 'episode': 'The Title IX Regime and the Long March Through and Beyond the Institutions', + 'episode_number': 1, + 'episode_id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd' + } + }] + + def try_get_user_name(self, d): + names = [d.get(n) for n in ('first', 'last')] + if None in names: + return next((n for n in names if n), None) + return ' '.join(names) + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + next_data = self._search_nextjs_data(webpage, display_id) + episode = next_data['props']['pageProps']['episode'] + + id = episode['id'] + title = (episode.get('title') + or self._og_search_title(webpage, fatal=False) + or self._html_extract_title(webpage)) + url = episode['m3u8'] + formats = self._extract_m3u8_formats(url, display_id, ext='ts') + self._sort_formats(formats) + + show = traverse_obj(episode, ('show', 'title')) + show_id = traverse_obj(episode, ('show', 'id')) + + show_json = None + app_slug = (self._html_search_regex( diff --git a/hypervideo_dl/extractor/caltrans.py b/hypervideo_dl/extractor/caltrans.py new file mode 100644 --- /dev/null +++ b/hypervideo_dl/extractor/caltrans.py @@ -0,0 +1,41 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class CaltransIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?ca\.gov/vm/loc/[^/]+/(?P<id>[a-z0-9_]+)\.htm' + _TEST = { + 'url': 'https://cwwp2.dot.ca.gov/vm/loc/d3/hwy50at24th.htm', + 'info_dict': { + 'id': 'hwy50at24th', + 'ext': 'ts', + 'title': 'US-50 : Sacramento : Hwy 50 at 24th', + 'live_status': 'is_live',
+ 'thumbnail': 'https://cwwp2.dot.ca.gov/data/d3/cctv/image/hwy50at24th/hwy50at24th.jpg', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + global_vars = self._search_regex( + r'<script[^<]+?([^<]+\.m3u8[^<]+)</script>', + webpage, 'Global Vars') + route_place = self._search_regex(r'routePlace\s*=\s*"([^"]+)"', global_vars, 'Route Place', fatal=False) + location_name = self._search_regex(r'locationName\s*=\s*"([^"]+)"', global_vars, 'Location Name', fatal=False) + poster_url = self._search_regex(r'posterURL\s*=\s*"([^"]+)"', global_vars, 'Poster Url', fatal=False) + video_stream = self._search_regex(r'videoStreamURL\s*=\s*"([^"]+)"', global_vars, 'Video Stream URL', fatal=False) + + formats = self._extract_m3u8_formats(video_stream, video_id, 'ts', live=True) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': f'{route_place} : {location_name}', + 'is_live': True, + 'formats': formats, + 'thumbnail': poster_url, + } diff --git a/hypervideo_dl/extractor/cam4.py b/hypervideo_dl/extractor/cam4.py index 30daf2b..2a3931f 100644 --- a/hypervideo_dl/extractor/cam4.py +++ b/hypervideo_dl/extractor/cam4.py @@ -13,6 +13,8 @@ class CAM4IE(InfoExtractor): 'ext': 'mp4', 'title': 're:^foxynesss [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'age_limit': 18, + 'live_status': 'is_live', + 'thumbnail': 'https://snapshots.xcdnpro.com/thumbnails/foxynesss', } } @@ -25,8 +27,9 @@ class CAM4IE(InfoExtractor): return { 'id': channel_id, - 'title': self._live_title(channel_id), + 'title': channel_id, 'is_live': True, 'age_limit': 18, 'formats': formats, + 'thumbnail': f'https://snapshots.xcdnpro.com/thumbnails/{channel_id}', } diff --git a/hypervideo_dl/extractor/cammodels.py b/hypervideo_dl/extractor/cammodels.py index eb2a8b4..3dc1937 100644 --- a/hypervideo_dl/extractor/cammodels.py +++ b/hypervideo_dl/extractor/cammodels.py @@ -91,7 +91,7 @@ class CamModelsIE(InfoExtractor): return { 'id': user_id, - 'title': self._live_title(user_id), + 'title': user_id, 'is_live': True, 'formats': formats, 'age_limit': 18 diff --git a/hypervideo_dl/extractor/canalalpha.py b/hypervideo_dl/extractor/canalalpha.py new file mode 100644 index 0000000..0365cb2 --- /dev/null +++ b/hypervideo_dl/extractor/canalalpha.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + dict_get, + try_get, + unified_strdate, +) + + +class CanalAlphaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?canalalpha\.ch/play/[^/]+/[^/]+/(?P<id>\d+)/?.*' + + _TESTS = [{ + 'url': 'https://www.canalalpha.ch/play/le-journal/episode/24520/jeudi-28-octobre-2021', + 'info_dict': { + 'id': '24520', + 'ext': 'mp4', + 'title': 'Jeudi 28 octobre 2021', + 'description': 'md5:d30c6c3e53f8ad40d405379601973b30', + 'thumbnail': 'https://static.canalalpha.ch/poster/journal/journal_20211028.jpg', + 'upload_date': '20211028', + 'duration': 1125, + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.canalalpha.ch/play/le-journal/topic/24512/la-poste-fait-de-neuchatel-un-pole-cryptographique', + 'info_dict': { + 'id': '24512', + 'ext': 'mp4', + 'title': 'La Poste fait de Neuchâtel un pôle cryptographique', + 'description': 'md5:4ba63ae78a0974d1a53d6703b6e1dedf', + 'thumbnail': 'https://static.canalalpha.ch/poster/news/news_39712.jpg', + 'upload_date': '20211028', + 'duration': 138, + }, + 'params': {'skip_download': True} + }, { + 'url':
'https://www.canalalpha.ch/play/eureka/episode/24484/ces-innovations-qui-veulent-rendre-lagriculture-plus-durable', + 'info_dict': { + 'id': '24484', + 'ext': 'mp4', + 'title': 'Ces innovations qui veulent rendre l’agriculture plus durable', + 'description': 'md5:3de3f151180684621e85be7c10e4e613', + 'thumbnail': 'https://static.canalalpha.ch/poster/magazine/magazine_10236.jpg', + 'upload_date': '20211026', + 'duration': 360, + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.canalalpha.ch/play/avec-le-temps/episode/23516/redonner-de-leclat-grace-au-polissage', + 'info_dict': { + 'id': '23516', + 'ext': 'mp4', + 'title': 'Redonner de l\'éclat grâce au polissage', + 'description': 'md5:0d8fbcda1a5a4d6f6daa3165402177e1', + 'thumbnail': 'https://static.canalalpha.ch/poster/magazine/magazine_9990.png', + 'upload_date': '20210726', + 'duration': 360, + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + data_json = self._parse_json(self._search_regex( + r'window\.__SERVER_STATE__\s?=\s?({(?:(?!};)[^"]|"([^"]|\\")*")+})\s?;', + webpage, 'data_json'), id)['1']['data']['data'] + manifests = try_get(data_json, lambda x: x['video']['manifests'], expected_type=dict) or {} + subtitles = {} + formats = [{ + 'url': video['$url'], + 'ext': 'mp4', + 'width': try_get(video, lambda x: x['res']['width'], expected_type=int), + 'height': try_get(video, lambda x: x['res']['height'], expected_type=int), + } for video in try_get(data_json, lambda x: x['video']['mp4'], expected_type=list) or [] if video.get('$url')] + if manifests.get('hls'): + m3u8_frmts, m3u8_subs = self._parse_m3u8_formats_and_subtitles(manifests['hls'], video_id=id) + formats.extend(m3u8_frmts) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) + if manifests.get('dash'): + dash_frmts, dash_subs = self._parse_mpd_formats_and_subtitles(manifests['dash']) + formats.extend(dash_frmts) + subtitles = self._merge_subtitles(subtitles, dash_subs) + self._sort_formats(formats) + return { + 'id': id, + 'title': data_json.get('title').strip(), + 'description': clean_html(dict_get(data_json, ('longDesc', 'shortDesc'))), + 'thumbnail': data_json.get('poster'), + 'upload_date': unified_strdate(dict_get(data_json, ('webPublishAt', 'featuredAt', 'diffusionDate'))), + 'duration': try_get(data_json, lambda x: x['video']['duration'], expected_type=int), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/hypervideo_dl/extractor/canvas.py b/hypervideo_dl/extractor/canvas.py index 49e7e4e..8b99037 100644 --- a/hypervideo_dl/extractor/canvas.py +++ b/hypervideo_dl/extractor/canvas.py @@ -1,4 +1,5 @@ from __future__ import unicode_literals +import json from .common import InfoExtractor @@ -41,9 +42,9 @@ class CanvasIE(InfoExtractor): _GEO_BYPASS = False _HLS_ENTRY_PROTOCOLS_MAP = { 'HLS': 'm3u8_native', - 'HLS_AES': 'm3u8', + 'HLS_AES': 'm3u8_native', } - _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1' + _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2' def _real_extract(self, url): mobj = self._match_valid_url(url) @@ -59,18 +60,23 @@ class CanvasIE(InfoExtractor): # New API endpoint if not data: + vrtnutoken = self._download_json('https://token.vrt.be/refreshtoken', + video_id, note='refreshtoken: Retrieve vrtnutoken', + errnote='refreshtoken failed')['vrtnutoken'] headers = self.geo_verification_headers() - 
headers.update({'Content-Type': 'application/json'}) - token = self._download_json( + headers.update({'Content-Type': 'application/json; charset=utf-8'}) + vrtPlayerToken = self._download_json( '%s/tokens' % self._REST_API_BASE, video_id, - 'Downloading token', data=b'', headers=headers)['vrtPlayerToken'] + 'Downloading token', headers=headers, data=json.dumps({ + 'identityToken': vrtnutoken + }).encode('utf-8'))['vrtPlayerToken'] data = self._download_json( '%s/videos/%s' % (self._REST_API_BASE, video_id), video_id, 'Downloading video JSON', query={ - 'vrtPlayerToken': token, - 'client': '%s@PROD' % site_id, + 'vrtPlayerToken': vrtPlayerToken, + 'client': 'null', }, expected_status=400) - if not data.get('title'): + if 'title' not in data: code = data.get('code') if code == 'AUTHENTICATION_REQUIRED': self.raise_login_required() @@ -78,7 +84,8 @@ class CanvasIE(InfoExtractor): self.raise_geo_restricted(countries=['BE']) raise ExtractorError(data.get('message') or code, expected=True) - title = data['title'] + # Note: The title may be an empty string + title = data['title'] or f'{site_id} {video_id}' description = data.get('description') formats = [] @@ -238,10 +245,6 @@ class VrtNUIE(GigyaBaseIE): 'upload_date': '20200727', }, 'skip': 'This video is only available for registered users', - 'params': { - 'username': '', - 'password': '', - }, 'expected_warnings': ['is not a supported codec'], }, { # Only available via new API endpoint @@ -257,34 +260,20 @@ class VrtNUIE(GigyaBaseIE): 'episode_number': 5, }, 'skip': 'This video is only available for registered users', - 'params': { - 'username': '', - 'password': '', - }, 'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'], }] _NETRC_MACHINE = 'vrtnu' - _APIKEY = '3_qhEcPa5JGFROVwu5SWKqJ4mVOIkwlFNMSKwzPDAh8QZOtHqu6L4nD5Q7lk0eXOOG' + _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' _CONTEXT_ID = 'R3595707040' - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - auth_info = self._download_json( - 'https://accounts.vrt.be/accounts.login', None, - note='Login data', errnote='Could not get Login data', - headers={}, data=urlencode_postdata({ - 'loginID': username, - 'password': password, - 'sessionExpiration': '-2', - 'APIKey': self._APIKEY, - 'targetEnv': 'jssdk', - })) + def _perform_login(self, username, password): + auth_info = self._gigya_login({ + 'APIKey': self._APIKEY, + 'targetEnv': 'jssdk', + 'loginID': username, + 'password': password, + 'authMode': 'cookie', + }) if auth_info.get('errorDetails'): raise ExtractorError('Unable to login: VrtNU said: ' + auth_info.get('errorDetails'), expected=True) @@ -301,14 +290,15 @@ class VrtNUIE(GigyaBaseIE): 'UID': auth_info['UID'], 'UIDSignature': auth_info['UIDSignature'], 'signatureTimestamp': auth_info['signatureTimestamp'], - 'client_id': 'vrtnu-site', '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, } self._request_webpage( 'https://login.vrt.be/perform_login', - None, note='Requesting a token', errnote='Could not get a token', - headers={}, data=urlencode_postdata(post_data)) + None, note='Performing login', errnote='perform login failed', + headers={}, query={ + 'client_id': 'vrtnu-site' + }, data=urlencode_postdata(post_data)) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: diff --git a/hypervideo_dl/extractor/carambatv.py 
b/hypervideo_dl/extractor/carambatv.py index b57b86a..7e5cc90 100644 --- a/hypervideo_dl/extractor/carambatv.py +++ b/hypervideo_dl/extractor/carambatv.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + format_field, float_or_none, int_or_none, try_get, @@ -43,7 +44,7 @@ class CarambaTVIE(InfoExtractor): formats = [{ 'url': base_url + f['fn'], 'height': int_or_none(f.get('height')), - 'format_id': '%sp' % f['height'] if f.get('height') else None, + 'format_id': format_field(f, 'height', '%sp'), } for f in video['qualities'] if f.get('fn')] self._sort_formats(formats) diff --git a/hypervideo_dl/extractor/cbc.py b/hypervideo_dl/extractor/cbc.py index 2429521..4892419 100644 --- a/hypervideo_dl/extractor/cbc.py +++ b/hypervideo_dl/extractor/cbc.py @@ -2,17 +2,22 @@ from __future__ import unicode_literals import re +import json +import base64 +import time from .common import InfoExtractor from ..compat import ( compat_str, ) from ..utils import ( + int_or_none, + join_nonempty, js_to_json, - smuggle_url, - try_get, orderedSet, + smuggle_url, strip_or_none, + try_get, ExtractorError, ) @@ -122,9 +127,9 @@ class CBCIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - title = self._og_search_title(webpage, default=None) or self._html_search_meta( - 'twitter:title', webpage, 'title', default=None) or self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title', fatal=False) + title = (self._og_search_title(webpage, default=None) + or self._html_search_meta('twitter:title', webpage, 'title', default=None) + or self._html_extract_title(webpage)) entries = [ self._extract_player_init(player_init, display_id) for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] @@ -244,37 +249,129 @@ class CBCGemIE(InfoExtractor): 'params': {'format': 'bv'}, 'skip': 'Geo-restricted to Canada', }] - _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + + _GEO_COUNTRIES = ['CA'] + _TOKEN_API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37' + _NETRC_MACHINE = 'cbcgem' + _claims_token = None + + def _new_claims_token(self, email, password): + data = json.dumps({ + 'email': email, + 'password': password, + }).encode() + headers = {'content-type': 'application/json'} + query = {'apikey': self._TOKEN_API_KEY} + resp = self._download_json('https://api.loginradius.com/identity/v2/auth/login', + None, data=data, headers=headers, query=query) + access_token = resp['access_token'] + + query = { + 'access_token': access_token, + 'apikey': self._TOKEN_API_KEY, + 'jwtapp': 'jwt', + } + resp = self._download_json('https://cloud-api.loginradius.com/sso/jwt/api/token', + None, headers=headers, query=query) + sig = resp['signature'] + + data = json.dumps({'jwt': sig}).encode() + headers = {'content-type': 'application/json', 'ott-device-type': 'web'} + resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/token', + None, data=data, headers=headers) + cbc_access_token = resp['accessToken'] + + headers = {'content-type': 'application/json', 'ott-device-type': 'web', 'ott-access-token': cbc_access_token} + resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/profile', + None, headers=headers) + return resp['claimsToken'] + + def _get_claims_token_expiry(self): + # Token is a JWT + # JWT is decoded here and 'exp' field is extracted + # It is a Unix timestamp for when the token
expires + b64_data = self._claims_token.split('.')[1] + data = base64.urlsafe_b64decode(b64_data + "==") + return json.loads(data)['exp'] + + def claims_token_expired(self): + exp = self._get_claims_token_expiry() + if exp - time.time() < 10: + # It will expire in less than 10 seconds, or has already expired + return True + return False + + def claims_token_valid(self): + return self._claims_token is not None and not self.claims_token_expired() + + def _get_claims_token(self, email, password): + if not self.claims_token_valid(): + self._claims_token = self._new_claims_token(email, password) + self._downloader.cache.store(self._NETRC_MACHINE, 'claims_token', self._claims_token) + return self._claims_token + + def _real_initialize(self): + if self.claims_token_valid(): + return + self._claims_token = self._downloader.cache.load(self._NETRC_MACHINE, 'claims_token') + + def _find_secret_formats(self, formats, video_id): + """ Find a valid video url and convert it to the secret variant """ + base_format = next((f for f in formats if f.get('vcodec') != 'none'), None) + if not base_format: + return + + base_url = re.sub(r'(Manifest\(.*?),filter=[\w-]+(.*?\))', r'\1\2', base_format['url']) + url = re.sub(r'(Manifest\(.*?),format=[\w-]+(.*?\))', r'\1\2', base_url) + + secret_xml = self._download_xml(url, video_id, note='Downloading secret XML', fatal=False) + if not secret_xml: + return + + for child in secret_xml: + if child.attrib.get('Type') != 'video': + continue + for video_quality in child: + bitrate = int_or_none(video_quality.attrib.get('Bitrate')) + if not bitrate or 'Index' not in video_quality.attrib: + continue + height = int_or_none(video_quality.attrib.get('MaxHeight')) + + yield { + **base_format, + 'format_id': join_nonempty('sec', height), + # Note: \g<1> is necessary instead of \1 since bitrate is a number + 'url': re.sub(r'(QualityLevels\()\d+(\))', fr'\g<1>{bitrate}\2', base_url), + 'width': int_or_none(video_quality.attrib.get('MaxWidth')), + 'tbr': bitrate / 1000.0, + 'height': height, + } def _real_extract(self, url): video_id = self._match_id(url) - video_info = self._download_json(self._API_BASE + video_id, video_id) - - last_error = None - attempt = -1 - retries = self.get_param('extractor_retries', 15) - while attempt < retries: - attempt += 1 - if last_error: - self.report_warning('%s. Retrying ...' 
% last_error) - m3u8_info = self._download_json( - video_info['playSession']['url'], video_id, - note='Downloading JSON metadata%s' % f' (attempt {attempt})') - m3u8_url = m3u8_info.get('url') - if m3u8_url: - break - elif m3u8_info.get('errorCode') == 1: - self.raise_geo_restricted(countries=['CA']) - else: - last_error = f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}' - # 35 means media unavailable, but retries work - if m3u8_info.get('errorCode') != 35 or attempt >= retries: - raise ExtractorError(last_error) + video_info = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + video_id, video_id) + + email, password = self._get_login_info() + if email and password: + claims_token = self._get_claims_token(email, password) + headers = {'x-claims-token': claims_token} + else: + headers = {} + m3u8_info = self._download_json(video_info['playSession']['url'], video_id, headers=headers) + m3u8_url = m3u8_info.get('url') + + if m3u8_info.get('errorCode') == 1: + self.raise_geo_restricted(countries=['CA']) + elif m3u8_info.get('errorCode') == 35: + self.raise_login_required(method='password') + elif m3u8_info.get('errorCode') != 0: + raise ExtractorError(f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}') formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls') self._remove_duplicate_formats(formats) + formats.extend(self._find_secret_formats(formats, video_id)) - for i, format in enumerate(formats): + for format in formats: if format.get('vcodec') == 'none': if format.get('ext') is None: format['ext'] = 'm4a' @@ -328,7 +425,8 @@ class CBCGemPlaylistIE(InfoExtractor): show = match.group('show') show_info = self._download_json(self._API_BASE + show, season_id) season = int(match.group('season')) - season_info = try_get(show_info, lambda x: x['seasons'][season - 1]) + + season_info = next((s for s in show_info['seasons'] if s.get('season') == season), None) if season_info is None: raise ExtractorError(f'Couldn\'t find season {season} of {show}') @@ -377,7 +475,7 @@ class CBCGemPlaylistIE(InfoExtractor): class CBCGemLiveIE(InfoExtractor): IE_NAME = 'gem.cbc.ca:live' - _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>[0-9]{12})' + _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>\d+)' _TEST = { 'url': 'https://gem.cbc.ca/live/920604739687', 'info_dict': { @@ -396,21 +494,21 @@ class CBCGemLiveIE(InfoExtractor): # It's unclear where the chars at the end come from, but they appear to be # constant. Might need updating in the future. - _API = 'https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT' + # There are two URLs, some livestreams are in one, and some + # in the other. The JSON schema is the same for both.
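Two details in the CBC Gem hunks above are easy to miss: a JWT payload is plain base64url-encoded JSON whose exp claim is a Unix timestamp, and re.sub needs \g<1> whenever a numeric group reference is immediately followed by digits. A self-contained sketch of both (illustrative only, not code from this patch):

    import base64
    import json
    import re
    import time

    def jwt_expired(token, margin=10):
        payload = token.split('.')[1]
        payload += '=' * (-len(payload) % 4)  # re-pad base64url to a multiple of 4
        return json.loads(base64.urlsafe_b64decode(payload))['exp'] - time.time() < margin

    # fr'\1{96000}\2' would be read as a reference to group 19 and fail;
    # \g<1> keeps the group number unambiguous before the digits
    re.sub(r'(QualityLevels\()\d+(\))', fr'\g<1>{96000}\2', 'QualityLevels(128000)/Manifest(video)')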
+ _API_URLS = ['https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT', 'https://tpfeed.cbc.ca/f/ExhSPC/FNiv9xQx_BnT'] def _real_extract(self, url): video_id = self._match_id(url) - live_info = self._download_json(self._API, video_id)['entries'] - video_info = None - for stream in live_info: - if stream.get('guid') == video_id: - video_info = stream - - if video_info is None: - raise ExtractorError( - 'Couldn\'t find video metadata, maybe this livestream is now offline', - expected=True) + for api_url in self._API_URLS: + video_info = next(( + stream for stream in self._download_json(api_url, video_id)['entries'] + if stream.get('guid') == video_id), None) + if video_info: + break + else: + raise ExtractorError('Couldn\'t find video metadata, maybe this livestream is now offline', expected=True) return { '_type': 'url_transparent', diff --git a/hypervideo_dl/extractor/cbs.py b/hypervideo_dl/extractor/cbs.py index ae9ce58..2af36ea 100644 --- a/hypervideo_dl/extractor/cbs.py +++ b/hypervideo_dl/extractor/cbs.py @@ -77,21 +77,21 @@ class CBSIE(CBSBaseIE): (?: cbs:| https?://(?:www\.)?(?: - cbs\.com/(?:shows/[^/]+/video|movies/[^/]+)/| + cbs\.com/(?:shows|movies)/(?:video|[^/]+/video|[^/]+)/| colbertlateshow\.com/(?:video|podcasts)/) )(?P<id>[\w-]+)''' # All tests are blocked outside US _TESTS = [{ - 'url': 'https://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', + 'url': 'https://www.cbs.com/shows/video/xrUyNLtl9wd8D_RWWAg9NU2F_V6QpB3R/', 'info_dict': { - 'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_', + 'id': 'xrUyNLtl9wd8D_RWWAg9NU2F_V6QpB3R', 'ext': 'mp4', - 'title': 'Connect Chat feat. Garth Brooks', - 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013.
Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', - 'duration': 1495, - 'timestamp': 1385585425, - 'upload_date': '20131127', + 'title': 'Tough As Nails - Dreams Never Die', + 'description': 'md5:a3535a62531cdd52b0364248a2c1ae33', + 'duration': 2588, + 'timestamp': 1639015200, + 'upload_date': '20211209', 'uploader': 'CBSI-NEW', }, 'params': { @@ -99,14 +99,14 @@ class CBSIE(CBSBaseIE): 'skip_download': True, }, }, { - 'url': 'https://www.cbs.com/shows/the-late-show-with-stephen-colbert/video/60icOhMb9NcjbcWnF_gub9XXHdeBcNk2/the-late-show-6-23-21-christine-baranski-joy-oladokun-', + 'url': 'https://www.cbs.com/shows/video/sZH1MGgomIosZgxGJ1l263MFq16oMtW1/', 'info_dict': { - 'id': '60icOhMb9NcjbcWnF_gub9XXHdeBcNk2', - 'title': 'The Late Show - 6/23/21 (Christine Baranski, Joy Oladokun)', - 'timestamp': 1624507140, - 'description': 'md5:e01af24e95c74d55e8775aef86117b95', + 'id': 'sZH1MGgomIosZgxGJ1l263MFq16oMtW1', + 'title': 'The Late Show - 3/16/22 (Michael Buble, Rose Matafeo)', + 'timestamp': 1647488100, + 'description': 'md5:d0e6ec23c544b7fa8e39a8e6844d2439', 'uploader': 'CBSI-NEW', - 'upload_date': '20210624', + 'upload_date': '20220317', }, 'params': { 'ignore_no_formats_error': True, diff --git a/hypervideo_dl/extractor/ccma.py b/hypervideo_dl/extractor/ccma.py index ea98f86..9dbaabf 100644 --- a/hypervideo_dl/extractor/ccma.py +++ b/hypervideo_dl/extractor/ccma.py @@ -1,17 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -import calendar -import datetime - from .common import InfoExtractor from ..utils import ( clean_html, - extract_timezone, int_or_none, parse_duration, parse_resolution, try_get, + unified_timestamp, url_or_none, ) @@ -95,14 +92,8 @@ class CCMAIE(InfoExtractor): duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text')) tematica = try_get(informacio, lambda x: x['tematica']['text']) - timestamp = None data_utc = try_get(informacio, lambda x: x['data_emissio']['utc']) - try: - timezone, data_utc = extract_timezone(data_utc) - timestamp = calendar.timegm((datetime.datetime.strptime( - data_utc, '%Y-%d-%mT%H:%M:%S') - timezone).timetuple()) - except TypeError: - pass + timestamp = unified_timestamp(data_utc) subtitles = {} subtitols = media.get('subtitols') or [] diff --git a/hypervideo_dl/extractor/cctv.py b/hypervideo_dl/extractor/cctv.py index 9b86121..0ed5f32 100644 --- a/hypervideo_dl/extractor/cctv.py +++ b/hypervideo_dl/extractor/cctv.py @@ -162,7 +162,8 @@ class CCTVIE(InfoExtractor): 'url': video_url, 'format_id': 'http', 'quality': quality, - 'source_preference': -10 + # Sample clip + 'preference': -10 }) hls_url = try_get(data, lambda x: x['hls_url'], compat_str) diff --git a/hypervideo_dl/extractor/ceskatelevize.py b/hypervideo_dl/extractor/ceskatelevize.py index 5e04d38..ddf66b2 100644 --- a/hypervideo_dl/extractor/ceskatelevize.py +++ b/hypervideo_dl/extractor/ceskatelevize.py @@ -12,30 +12,15 @@ from ..utils import ( ExtractorError, float_or_none, sanitized_Request, - unescapeHTML, - update_url_query, + traverse_obj, urlencode_postdata, USER_AGENTS, ) class CeskaTelevizeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' + _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady)/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' _TESTS = [{ - 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', - 'info_dict': { - 'id': '61924494877246241', - 'ext': 'mp4',
'title': 'Hyde Park Civilizace: Život v Grónsku', - 'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 3350, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { 'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en', 'info_dict': { 'id': '61924494877028507', @@ -66,12 +51,60 @@ class CeskaTelevizeIE(InfoExtractor): }, { 'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25', 'only_matching': True, + }, { + # video with 18+ caution trailer + 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', + 'info_dict': { + 'id': '215562210900007-bogotart', + 'title': 'Queer: Bogotart', + 'description': 'Hlavní město Kolumbie v doprovodu queer umělců. Vroucí svět plný vášně, sebevědomí, ale i násilí a bolesti. Připravil Peter Serge Butko', + }, + 'playlist': [{ + 'info_dict': { + 'id': '61924494877311053', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Varování 18+)', + 'duration': 11.9, + }, + }, { + 'info_dict': { + 'id': '61924494877068022', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Queer)', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 1558.3, + }, + }], + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # iframe embed + 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', + 'only_matching': True, }] def _real_extract(self, url): playlist_id = self._match_id(url) - + parsed_url = compat_urllib_parse_urlparse(url) webpage = self._download_webpage(url, playlist_id) + site_name = self._og_search_property('site_name', webpage, fatal=False, default=None) + playlist_title = self._og_search_title(webpage, default=None) + if site_name and playlist_title: + playlist_title = playlist_title.replace(f' — {site_name}', '', 1) + playlist_description = self._og_search_description(webpage, default=None) + if playlist_description: + playlist_description = playlist_description.replace('\xa0', ' ') + + if parsed_url.path.startswith('/porady/'): + next_data = self._search_nextjs_data(webpage, playlist_id) + idec = traverse_obj(next_data, ('props', 'pageProps', 'data', ('show', 'mediaMeta'), 'idec'), get_all=False) + if not idec: + raise ExtractorError('Failed to find IDEC id') + iframe_hash = self._download_webpage('https://www.ceskatelevize.cz/v-api/iframe-hash/', playlist_id) + webpage = self._download_webpage('https://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php', playlist_id, + query={'hash': iframe_hash, 'origin': 'iVysilani', 'autoStart': 'true', 'IDEC': idec}) NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' if '%s
</p>
    ' % NOT_AVAILABLE_STRING in webpage: @@ -100,7 +133,7 @@ class CeskaTelevizeIE(InfoExtractor): data = { 'playlist[0][type]': type_, 'playlist[0][id]': episode_id, - 'requestUrl': compat_urllib_parse_urlparse(url).path, + 'requestUrl': parsed_url.path, 'requestSource': 'iVysilani', } @@ -108,7 +141,7 @@ class CeskaTelevizeIE(InfoExtractor): for user_agent in (None, USER_AGENTS['Safari']): req = sanitized_Request( - 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', + 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/', data=urlencode_postdata(data)) req.add_header('Content-type', 'application/x-www-form-urlencoded') @@ -130,9 +163,6 @@ class CeskaTelevizeIE(InfoExtractor): req = sanitized_Request(compat_urllib_parse_unquote(playlist_url)) req.add_header('Referer', url) - playlist_title = self._og_search_title(webpage, default=None) - playlist_description = self._og_search_description(webpage, default=None) - playlist = self._download_json(req, playlist_id, fatal=False) if not playlist: continue @@ -147,6 +177,7 @@ class CeskaTelevizeIE(InfoExtractor): is_live = item.get('type') == 'LIVE' formats = [] for format_id, stream_url in item.get('streamUrls', {}).items(): + stream_url = stream_url.replace('https://', 'http://') if 'playerType=flash' in stream_url: stream_formats = self._extract_m3u8_formats( stream_url, playlist_id, 'mp4', 'm3u8_native', @@ -182,8 +213,6 @@ class CeskaTelevizeIE(InfoExtractor): if playlist_len == 1: final_title = playlist_title or title - if is_live: - final_title = self._live_title(final_title) else: final_title = '%s (%s)' % (playlist_title, title) @@ -237,54 +266,3 @@ class CeskaTelevizeIE(InfoExtractor): yield line return '\r\n'.join(_fix_subtitle(subtitles)) - - -class CeskaTelevizePoradyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P[^/#?]+)' - _TESTS = [{ - # video with 18+ caution trailer - 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', - 'info_dict': { - 'id': '215562210900007-bogotart', - 'title': 'Queer: Bogotart', - 'description': 'Alternativní průvodce současným queer světem', - }, - 'playlist': [{ - 'info_dict': { - 'id': '61924494876844842', - 'ext': 'mp4', - 'title': 'Queer: Bogotart (Varování 18+)', - 'duration': 10.2, - }, - }, { - 'info_dict': { - 'id': '61924494877068022', - 'ext': 'mp4', - 'title': 'Queer: Bogotart (Queer)', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 1558.3, - }, - }], - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - # iframe embed - 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - data_url = update_url_query(unescapeHTML(self._search_regex( - (r']*\bdata-url=(["\'])(?P(?:(?!\1).)+)\1', - r']+\bsrc=(["\'])(?P(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'), - webpage, 'iframe player url', group='url')), query={ - 'autoStart': 'true', - }) - - return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key()) diff --git a/hypervideo_dl/extractor/chaturbate.py b/hypervideo_dl/extractor/chaturbate.py index a459dcb..8da51f9 100644 --- a/hypervideo_dl/extractor/chaturbate.py +++ b/hypervideo_dl/extractor/chaturbate.py @@ -101,7 +101,7 @@ class ChaturbateIE(InfoExtractor): return { 'id': video_id, - 'title': self._live_title(video_id), + 'title': video_id, 
'thumbnail': 'https://roomimg.stream.highwebmedia.com/ri/%s.jpg' % video_id, 'age_limit': self._rta_search(webpage), 'is_live': True, diff --git a/hypervideo_dl/extractor/chingari.py b/hypervideo_dl/extractor/chingari.py index 6bdc4f6..e6841fb 100644 --- a/hypervideo_dl/extractor/chingari.py +++ b/hypervideo_dl/extractor/chingari.py @@ -67,7 +67,7 @@ class ChingariBaseIE(InfoExtractor): class ChingariIE(ChingariBaseIE): - _VALID_URL = r'(?:https?://)(?:www\.)?chingari\.io/share/post\?id=(?P[^&/#?]+)' + _VALID_URL = r'https?://(?:www\.)?chingari\.io/share/post\?id=(?P[^&/#?]+)' _TESTS = [{ 'url': 'https://chingari.io/share/post?id=612f8f4ce1dc57090e8a7beb', 'info_dict': { @@ -102,7 +102,7 @@ class ChingariIE(ChingariBaseIE): class ChingariUserIE(ChingariBaseIE): - _VALID_URL = r'(?:https?://)(?:www\.)?chingari\.io/(?!share/post)(?P[^/?]+)' + _VALID_URL = r'https?://(?:www\.)?chingari\.io/(?!share/post)(?P[^/?]+)' _TESTS = [{ 'url': 'https://chingari.io/dada1023', 'playlist_mincount': 3, diff --git a/hypervideo_dl/extractor/closertotruth.py b/hypervideo_dl/extractor/closertotruth.py index 26243d5..517e121 100644 --- a/hypervideo_dl/extractor/closertotruth.py +++ b/hypervideo_dl/extractor/closertotruth.py @@ -54,8 +54,7 @@ class CloserToTruthIE(InfoExtractor): r']+src=["\'].*?\b(?:partner_id|p)/(\d+)', webpage, 'kaltura partner_id') - title = self._search_regex( - r'(.+?)\s*\|\s*.+?', webpage, 'video title') + title = self._html_extract_title(webpage, 'video title') select = self._search_regex( r'(?s)]+id="select-version"[^>]*>(.+?)', diff --git a/hypervideo_dl/extractor/common.py b/hypervideo_dl/extractor/common.py index df74c75..0035191 100644 --- a/hypervideo_dl/extractor/common.py +++ b/hypervideo_dl/extractor/common.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import base64 -import datetime +import collections import hashlib import itertools import json @@ -45,15 +45,18 @@ from ..utils import ( determine_ext, determine_protocol, dict_get, + encode_data_uri, error_to_compat_str, extract_attributes, ExtractorError, + filter_dict, fix_xml_ampersands, float_or_none, format_field, GeoRestrictedError, GeoUtils, int_or_none, + join_nonempty, js_to_json, JSON_LD_RE, mimetype2ext, @@ -73,7 +76,9 @@ from ..utils import ( str_to_int, strip_or_none, traverse_obj, + try_get, unescapeHTML, + UnsupportedError, unified_strdate, unified_timestamp, update_Request, @@ -134,6 +139,8 @@ class InfoExtractor(object): for HDS - URL of the F4M manifest, for DASH - URL of the MPD manifest, for MSS - URL of the ISM manifest. + * manifest_stream_number (For internal use only) + The index of the stream in the manifest file * ext Will be calculated from URL if missing * format A human-readable description of the format ("mp4 container with h264/opus"). @@ -161,9 +168,8 @@ class InfoExtractor(object): * filesize_approx An estimate for the number of bytes * player_url SWF Player URL (used for rtmpdump). * protocol The protocol that will be used for the actual - download, lower-case. - "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe", - "m3u8", "m3u8_native" or "http_dash_segments". + download, lower-case. One of "http", "https" or + one of the protocols defined in downloader.PROTOCOL_MAP * fragment_base_url Base URL for fragments. Each fragment's path value (if present) will be relative to @@ -179,6 +185,8 @@ class InfoExtractor(object): fragment_base_url * "duration" (optional, int or float) * "filesize" (optional, int) + * is_from_start Is a live format that can be downloaded + from the start. 
Boolean * preference Order number of this format. If this field is present and not None, the formats get sorted by this field, regardless of all other values. @@ -209,7 +217,7 @@ class InfoExtractor(object): (HTTP or RTMP) download. Boolean. * has_drm The format has DRM and cannot be downloaded. Boolean * downloader_options A dictionary of downloader options as - described in FileDownloader + described in FileDownloader (For internal use only) RTMP formats can also have the additional fields: page_url, app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn, rtmp_protocol, rtmp_real_time @@ -221,6 +229,7 @@ class InfoExtractor(object): The following fields are optional: + direct: True if a direct video file was given (must only be set by GenericIE) alt_title: A secondary title of the video. display_id An alternative identifier for the video, not necessarily unique, but available before title. Typically, id is @@ -235,16 +244,22 @@ class InfoExtractor(object): * "resolution" (optional, string "{width}x{height}", deprecated) * "filesize" (optional, int) + * "http_headers" (dict) - HTTP headers for the request thumbnail: Full URL to a video thumbnail image. description: Full video description. uploader: Full name of the video uploader. license: License name the video is licensed under. creator: The creator of the video. - release_timestamp: UNIX timestamp of the moment the video was released. - release_date: The date (YYYYMMDD) when the video was released. timestamp: UNIX timestamp of the moment the video was uploaded - upload_date: Video upload date (YYYYMMDD). - If not explicitly set, calculated from timestamp. + upload_date: Video upload date in UTC (YYYYMMDD). + If not explicitly set, calculated from timestamp + release_timestamp: UNIX timestamp of the moment the video was released. + If it is not clear whether to use timestamp or this, use the former + release_date: The date (YYYYMMDD) when the video was released in UTC. + If not explicitly set, calculated from release_timestamp + modified_timestamp: UNIX timestamp of the moment the video was last modified. + modified_date: The date (YYYYMMDD) when the video was last modified in UTC. + If not explicitly set, calculated from modified_timestamp uploader_id: Nickname or id of the video uploader. uploader_url: Full URL to a personal webpage of the video uploader. channel: Full name of the channel the video is uploaded on. @@ -252,6 +267,7 @@ class InfoExtractor(object): fields. This depends on a particular extractor. channel_id: Id of the channel. channel_url: Full URL to a channel webpage. + channel_follower_count: Number of followers of the channel. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format {tag: subformats}. "tag" is usually a language code, and @@ -262,6 +278,8 @@ class InfoExtractor(object): * "url": A URL pointing to the subtitles file It can optionally also have: * "name": Name or description of the subtitles + * "http_headers": A dictionary of additional HTTP headers + to add to the request. "ext" will be calculated from URL if missing automatic_captions: Like 'subtitles'; contains automatically generated captions instead of normal subtitles @@ -340,6 +358,7 @@ class InfoExtractor(object): series, programme or podcast: series: Title of the series or programme the video episode belongs to. + series_id: Id of the series or programme the video episode belongs to, as a unicode string. season: Title of the season the video episode belongs to. 
season_number: Number of the season the video episode belongs to, as an integer. season_id: Id of the season the video episode belongs to, as a unicode string. @@ -366,6 +385,7 @@ disc_number: Number of the disc or other physical medium the track belongs to, as an integer. release_year: Year (YYYY) when the album was released. + composer: Composer of the piece Unless mentioned otherwise, the fields should be Unicode strings. @@ -379,6 +399,11 @@ Additionally, playlists can have "id", "title", and any other relevant attributes with the same semantics as videos (see above). + It can also have the following optional fields: + + playlist_count: The total number of videos in a playlist. If not given, + YoutubeDL tries to calculate it from "entries" + _type "multi_video" indicates that there are multiple videos that form a single show, for example, multiple acts of an opera or TV episode. @@ -404,13 +429,21 @@ title, description etc. - Subclasses of this one should re-define the _real_initialize() and - _real_extract() methods and define a _VALID_URL regexp. + Subclasses of this should define a _VALID_URL regexp and re-define the + _real_extract() and (optionally) _real_initialize() methods. Probably, they should also be added to the list of extractors. Subclasses may also override suitable() if necessary, but ensure the function signature is preserved and that this function imports everything it needs - (except other extractors), so that lazy_extractors works correctly + (except other extractors), so that lazy_extractors works correctly. + + To support username + password (or netrc) login, the extractor must define a + _NETRC_MACHINE and re-define _perform_login(username, password) and + (optionally) _initialize_pre_login() methods. The _perform_login method will + be called between _initialize_pre_login and _real_initialize if credentials + are passed by the user. In cases where it is necessary to have the login + process as part of the extraction rather than initialization, _perform_login + can be left undefined. _GEO_BYPASS attribute may be set to False in order to disable geo restriction bypass mechanisms for a particular extractor. @@ -438,17 +471,21 @@ _GEO_COUNTRIES = None _GEO_IP_BLOCKS = None _WORKING = True + _NETRC_MACHINE = None + IE_DESC = None _LOGIN_HINTS = { - 'any': 'Use --cookies, --username and --password or --netrc to provide account credentials', + 'any': 'Use --cookies, --cookies-from-browser, --username and --password, or --netrc to provide account credentials', 'cookies': ( 'Use --cookies-from-browser or --cookies for the authentication. ' 'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'), - 'password': 'Use --username and --password or --netrc to provide account credentials', + 'password': 'Use --username and --password, or --netrc to provide account credentials', } def __init__(self, downloader=None): - """Constructor. Receives an optional downloader.""" + """Constructor. Receives an optional downloader (a YoutubeDL instance). 
+ If a downloader is not passed during initialization, + it must be set using "set_downloader()" before "extract()" is called""" self._ready = False self._x_forwarded_for_ip = None self._printed_messages = set() @@ -460,6 +497,8 @@ # we have cached the regexp for *this* class, whereas getattr would also # match the superclass if '_VALID_URL_RE' not in cls.__dict__: + if '_VALID_URL' not in cls.__dict__: + cls._VALID_URL = cls._make_valid_url() cls._VALID_URL_RE = re.compile(cls._VALID_URL) return cls._VALID_URL_RE.match(url) @@ -486,6 +525,10 @@ """Getter method for _WORKING.""" return cls._WORKING + @classmethod + def supports_login(cls): + return bool(cls._NETRC_MACHINE) + def initialize(self): """Initializes an instance (authentication, etc).""" self._printed_messages = set() @@ -494,6 +537,13 @@ 'ip_blocks': self._GEO_IP_BLOCKS, }) if not self._ready: + self._initialize_pre_login() + if self.supports_login(): + username, password = self._get_login_info() + if username: + self._perform_login(username, password) + elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE): + self.report_warning(f'Login with password is not supported for this website. {self._LOGIN_HINTS["cookies"]}') self._real_initialize() self._ready = True @@ -602,10 +652,19 @@ if self.__maybe_fake_ip_and_retry(e.countries): continue raise + except UnsupportedError: + raise except ExtractorError as e: - video_id = e.video_id or self.get_temp_id(url) - raise ExtractorError( - e.msg, video_id=video_id, ie=self.IE_NAME, tb=e.traceback, expected=e.expected, cause=e.cause) + kwargs = { + 'video_id': e.video_id or self.get_temp_id(url), + 'ie': self.IE_NAME, + 'tb': e.traceback or sys.exc_info()[2], + 'expected': e.expected, + 'cause': e.cause + } + if hasattr(e, 'countries'): + kwargs['countries'] = e.countries + raise type(e)(e.orig_msg, **kwargs) except compat_http_client.IncompleteRead as e: raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url)) except (KeyError, StopIteration) as e: @@ -627,16 +686,24 @@ return False def set_downloader(self, downloader): - """Sets the downloader for this IE.""" + """Sets a YoutubeDL instance as the downloader for this IE.""" self._downloader = downloader + def _initialize_pre_login(self): + """ Initialization before login. Redefine in subclasses.""" + pass + + def _perform_login(self, username, password): + """ Login with username and password. Redefine in subclasses.""" + pass + def _real_initialize(self): """Real initialization process. Redefine in subclasses.""" pass def _real_extract(self, url): """Real extraction process. Redefine in subclasses.""" - pass + raise NotImplementedError('This method must be implemented by subclasses') @classmethod def ie_key(cls): @@ -664,7 +731,7 @@ See _download_webpage docstring for arguments specification. """ if not self._downloader._first_webpage_request: - sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0 + sleep_interval = self.get_param('sleep_interval_requests') or 0 if sleep_interval > 0: self.to_screen('Sleeping %s seconds ...' 
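# --- Illustrative aside (editor's sketch, not part of the patch) -------------
# How an extractor would opt in to the login plumbing introduced above: define
# _NETRC_MACHINE (so supports_login() is true) and override _perform_login(),
# which initialize() now calls between _initialize_pre_login() and
# _real_initialize() whenever credentials are supplied. The site name, URL and
# payload below are hypothetical:
from hypervideo_dl.extractor.common import InfoExtractor
from hypervideo_dl.utils import ExtractorError, urlencode_postdata

class ExampleIE(InfoExtractor):
    _NETRC_MACHINE = 'example'  # enables --username/--password and netrc

    def _perform_login(self, username, password):
        login = self._download_json(
            'https://example.com/api/login', None, 'Logging in',
            data=urlencode_postdata({'user': username, 'pass': password}))
        if not login.get('ok'):
            raise ExtractorError('Unable to log in', expected=True)
# -----------------------------------------------------------------------------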
% sleep_interval) time.sleep(sleep_interval) @@ -715,7 +782,7 @@ class InfoExtractor(object): errmsg = '%s: %s' % (errnote, error_to_compat_str(err)) if fatal: - raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) + raise ExtractorError(errmsg, cause=err) else: self.report_warning(errmsg) return False @@ -970,7 +1037,7 @@ class InfoExtractor(object): if transform_source: json_string = transform_source(json_string) try: - return json.loads(json_string) + return json.loads(json_string, strict=False) except ValueError as ve: errmsg = '%s: Failed to parse JSON ' % video_id if fatal: @@ -1063,23 +1130,30 @@ class InfoExtractor(object): def raise_login_required( self, msg='This video is only available for registered users', - metadata_available=False, method='any'): - if metadata_available and self.get_param('ignore_no_formats_error'): + metadata_available=False, method=NO_DEFAULT): + if metadata_available and ( + self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')): self.report_warning(msg) + return + if method is NO_DEFAULT: + method = 'any' if self.supports_login() else 'cookies' if method is not None: + assert method in self._LOGIN_HINTS, 'Invalid login method' msg = '%s. %s' % (msg, self._LOGIN_HINTS[method]) raise ExtractorError(msg, expected=True) def raise_geo_restricted( self, msg='This video is not available from your location due to geo restriction', countries=None, metadata_available=False): - if metadata_available and self.get_param('ignore_no_formats_error'): + if metadata_available and ( + self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')): self.report_warning(msg) else: raise GeoRestrictedError(msg, countries=countries) def raise_no_formats(self, msg, expected=False, video_id=None): - if expected and self.get_param('ignore_no_formats_error'): + if expected and ( + self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')): self.report_warning(msg, video_id) elif isinstance(msg, ExtractorError): raise msg @@ -1088,39 +1162,39 @@ class InfoExtractor(object): # Methods for following #608 @staticmethod - def url_result(url, ie=None, video_id=None, video_title=None, **kwargs): + def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs): """Returns a URL that points to a page that should be processed""" - # TODO: ie should be the class used for getting the info - video_info = {'_type': 'url', - 'url': url, - 'ie_key': ie} - video_info.update(kwargs) + if ie is not None: + kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key() if video_id is not None: - video_info['id'] = video_id + kwargs['id'] = video_id if video_title is not None: - video_info['title'] = video_title - return video_info + kwargs['title'] = video_title + return { + **kwargs, + '_type': 'url_transparent' if url_transparent else 'url', + 'url': url, + } - def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None): - urls = orderedSet( - self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) - for m in matches) - return self.playlist_result( - urls, playlist_id=playlist_id, playlist_title=playlist_title) + def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs): + urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {})) + for m in orderedSet(map(getter, matches) if getter else matches)) + return self.playlist_result(urls, playlist_id, 
playlist_title, **kwargs) @staticmethod - def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs): + def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs): """Returns a playlist""" - video_info = {'_type': 'playlist', - 'entries': entries} - video_info.update(kwargs) if playlist_id: - video_info['id'] = playlist_id + kwargs['id'] = playlist_id if playlist_title: - video_info['title'] = playlist_title + kwargs['title'] = playlist_title if playlist_description is not None: - video_info['description'] = playlist_description - return video_info + kwargs['description'] = playlist_description + return { + **kwargs, + '_type': 'multi_video' if multi_video else 'playlist', + 'entries': entries, + } def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """ @@ -1137,7 +1211,7 @@ if mobj: break - _name = self._downloader._color_text(name, 'blue') + _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS) if mobj: if group is None: @@ -1225,8 +1299,8 @@ @staticmethod def _og_regexes(prop): content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' - property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)' % {'prop': re.escape(prop)}) + property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)' % {'prop': re.escape(prop), 'sep': '(?::|[:-])'}) template = r'<meta[^>]+?%s[^>]+?%s' return [ template % (property_re, content_re), @@ -1257,8 +1331,8 @@ def _og_search_description(self, html, **kargs): return self._og_search_property('description', html, fatal=False, **kargs) - def _og_search_title(self, html, **kargs): - return self._og_search_property('title', html, **kargs) + def _og_search_title(self, html, *, fatal=False, **kargs): + return self._og_search_property('title', html, fatal=fatal, **kargs) def _og_search_video_url(self, html, name='video url', secure=True, **kargs): regexes = self._og_regexes('video') + self._og_regexes('video:url') @@ -1269,6 +1343,9 @@ def _og_search_url(self, html, **kargs): return self._og_search_property('url', html, **kargs) + def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs): + return self._html_search_regex(r'(?s)<title>([^<]+)</title>', html, name, fatal=fatal, **kwargs) + def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): name = variadic(name) if display_name is None: @@ -1409,6 +1486,23 @@ continue info[count_key] = interaction_count + def extract_chapter_information(e): + chapters = [{ + 'title': part.get('name'), + 'start_time': part.get('startOffset'), + 'end_time': part.get('endOffset'), + } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip'] + for idx, (last_c, current_c, next_c) in enumerate(zip( + [{'end_time': 0}] + chapters, chapters, chapters[1:])): + current_c['end_time'] = current_c['end_time'] or next_c['start_time'] + current_c['start_time'] = current_c['start_time'] or last_c['end_time'] + if None in current_c.values(): + self.report_warning(f'Chapter {idx} contains broken data. 
Not extracting chapters') + return + if chapters: + chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration'] + info['chapters'] = chapters + def extract_video_object(e): assert e['@type'] == 'VideoObject' author = e.get('author') @@ -1416,7 +1510,8 @@ class InfoExtractor(object): 'url': url_or_none(e.get('contentUrl')), 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), - 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')), + 'thumbnails': [{'url': url_or_none(url)} + for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))], 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), # author can be an instance of 'Organization' or 'Person' types. @@ -1431,12 +1526,21 @@ class InfoExtractor(object): 'view_count': int_or_none(e.get('interactionCount')), }) extract_interaction_statistic(e) + extract_chapter_information(e) - for e in json_ld: - if '@context' in e: + def traverse_json_ld(json_ld, at_top_level=True): + for e in json_ld: + if at_top_level and '@context' not in e: + continue + if at_top_level and set(e.keys()) == {'@context', '@graph'}: + traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False) + break item_type = e.get('@type') if expected_type is not None and expected_type != item_type: continue + rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none) + if rating is not None: + info['average_rating'] = rating if item_type in ('TVEpisode', 'Episode'): episode_name = unescapeHTML(e.get('name')) info.update({ @@ -1466,8 +1570,10 @@ class InfoExtractor(object): info.update({ 'timestamp': parse_iso8601(e.get('datePublished')), 'title': unescapeHTML(e.get('headline')), - 'description': unescapeHTML(e.get('articleBody')), + 'description': unescapeHTML(e.get('articleBody') or e.get('description')), }) + if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject': + extract_video_object(e['video'][0]) elif item_type == 'VideoObject': extract_video_object(e) if expected_type is None: @@ -1481,7 +1587,34 @@ class InfoExtractor(object): continue else: break - return dict((k, v) for k, v in info.items() if v is not None) + traverse_json_ld(json_ld) + + return filter_dict(info) + + def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw): + return self._parse_json( + self._search_regex( + r'(?s)]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)', + webpage, 'next.js data', fatal=fatal, **kw), + video_id, transform_source=transform_source, fatal=fatal) + + def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'): + ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. 
''' + # not all websites do this, but it can be changed + # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source + rectx = re.escape(context_name) + js, arg_keys, arg_vals = self._search_regex( + (r'<script>window\.%s=(?P<js>\{.*?\})</script>' % rectx, + r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx), + webpage, context_name, group=['js', 'arg_keys', 'arg_vals']) + + args = dict(zip(arg_keys.split(','), arg_vals.split(','))) + + for key, val in args.items(): + if val in ('undefined', 'void 0'): + args[key] = 'null' + + return self._parse_json(js_to_json(js, args), video_id)['data'][0] @staticmethod def _hidden_inputs(html): @@ -1510,20 +1643,20 @@ class InfoExtractor(object): default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality', 'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr', - 'proto', 'ext', 'hasaud', 'source', 'format_id') # These must not be aliases + 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr', 'height', 'width', 'proto', 'vext', 'abr', 'aext', - 'fps', 'fs_approx', 'source', 'format_id') + 'fps', 'fs_approx', 'source', 'id') settings = { 'vcodec': {'type': 'ordered', 'regex': True, 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']}, 'acodec': {'type': 'ordered', 'regex': True, - 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']}, + 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']}, 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range', 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]}, 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol', - 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']}, + 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']}, 'vext': {'type': 'ordered', 'field': 'video_ext', 'order': ('mp4', 'webm', 'flv', '', 'none'), 'order_free': ('webm', 'mp4', 'flv', '', 'none')}, @@ -1537,8 +1670,8 @@ 'ie_pref': {'priority': True, 'type': 'extractor'}, 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)}, 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)}, - 'lang': {'convert': 'ignore', 'field': 'language_preference'}, - 'quality': {'convert': 'float_none', 'default': -1}, + 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1}, + 'quality': {'convert': 'float', 'default': -1}, 'filesize': {'convert': 'bytes'}, 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'}, 'id': {'convert': 'string', 'field': 'format_id'}, @@ -1549,7 +1682,7 @@ 'vbr': {'convert': 'float_none'}, 'abr': {'convert': 'float_none'}, 'asr': {'convert': 'float_none'}, - 'source': {'convert': 'ignore', 'field': 'source_preference'}, + 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1}, 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')}, 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True}, @@ -1558,39 +1691,51 @@ 'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))}, 
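# --- Illustrative aside (editor's sketch, not part of the patch) -------------
# The settings above drive format sorting; as the hunks just below show,
# FormatSort is now constructed with the extractor itself, so extractor code
# only supplies an optional field preference. A hedged sketch with fabricated
# formats ('res' and 'br' are real sort fields from `settings`):
formats = [
    {'format_id': 'hls-720', 'url': 'https://example.invalid/a.m3u8', 'height': 720, 'tbr': 1800},
    {'format_id': 'hls-1080', 'url': 'https://example.invalid/b.m3u8', 'height': 1080, 'tbr': 900},
]
# Inside an InfoExtractor subclass one would write:
#     self._sort_formats(formats, field_preference=('res', 'br'))
# The list is sorted worst-to-best in place; YoutubeDL picks from the tail.
# -----------------------------------------------------------------------------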
- # Most of these exist only for compatibility reasons - 'dimension': {'type': 'alias', 'field': 'res'}, - 'resolution': {'type': 'alias', 'field': 'res'}, - 'extension': {'type': 'alias', 'field': 'ext'}, - 'bitrate': {'type': 'alias', 'field': 'br'}, - 'total_bitrate': {'type': 'alias', 'field': 'tbr'}, - 'video_bitrate': {'type': 'alias', 'field': 'vbr'}, - 'audio_bitrate': {'type': 'alias', 'field': 'abr'}, - 'framerate': {'type': 'alias', 'field': 'fps'}, - 'language_preference': {'type': 'alias', 'field': 'lang'}, # not named as 'language' because such a field exists - 'protocol': {'type': 'alias', 'field': 'proto'}, + # For compatibility with youtube-dl + 'format_id': {'type': 'alias', 'field': 'id'}, + 'preference': {'type': 'alias', 'field': 'ie_pref'}, + 'language_preference': {'type': 'alias', 'field': 'lang'}, 'source_preference': {'type': 'alias', 'field': 'source'}, + 'protocol': {'type': 'alias', 'field': 'proto'}, 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'}, - 'filesize_estimate': {'type': 'alias', 'field': 'size'}, - 'samplerate': {'type': 'alias', 'field': 'asr'}, - 'video_ext': {'type': 'alias', 'field': 'vext'}, - 'audio_ext': {'type': 'alias', 'field': 'aext'}, - 'video_codec': {'type': 'alias', 'field': 'vcodec'}, - 'audio_codec': {'type': 'alias', 'field': 'acodec'}, - 'video': {'type': 'alias', 'field': 'hasvid'}, - 'has_video': {'type': 'alias', 'field': 'hasvid'}, - 'audio': {'type': 'alias', 'field': 'hasaud'}, - 'has_audio': {'type': 'alias', 'field': 'hasaud'}, - 'extractor': {'type': 'alias', 'field': 'ie_pref'}, - 'preference': {'type': 'alias', 'field': 'ie_pref'}, - 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'}, - 'format_id': {'type': 'alias', 'field': 'id'}, + + # Deprecated + 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True}, + 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True}, + 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True}, + 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True}, + 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True}, + 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True}, + 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True}, + 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True}, + 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True}, + 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True}, + 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True}, + 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True}, + 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True}, + 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True}, + 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True}, + 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True}, + 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True}, + 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True}, + 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True}, + 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True}, } - _order = [] + def __init__(self, ie, field_preference): + self._order = [] + self.ydl = ie._downloader + self.evaluate_params(self.ydl.params, field_preference) + if ie.get_param('verbose'): + self.print_verbose_info(self.ydl.write_debug) def _get_field_setting(self, field, key): if field not in self.settings: + if key in 
('forced', 'priority'): + return False + self.ydl.deprecation_warning( + f'Using arbitrary fields ({field}) for format sorting is deprecated ' + 'and may be removed in a future version') self.settings[field] = {} propObj = self.settings[field] if key not in propObj: @@ -1673,7 +1818,11 @@ class InfoExtractor(object): if field is None: continue if self._get_field_setting(field, 'type') == 'alias': - field = self._get_field_setting(field, 'field') + alias, field = field, self._get_field_setting(field, 'field') + if self._get_field_setting(alias, 'deprecated'): + self.ydl.deprecation_warning( + f'Format sorting alias {alias} is deprecated ' + f'and may be removed in a future version. Please use {field} instead') reverse = match.group('reverse') is not None closest = match.group('separator') == '~' limit_text = match.group('limit') @@ -1777,10 +1926,7 @@ class InfoExtractor(object): def _sort_formats(self, formats, field_preference=[]): if not formats: return - format_sort = self.FormatSort() # params and to_screen are taken from the downloader - format_sort.evaluate_params(self._downloader.params, field_preference) - if self.get_param('verbose', False): - format_sort.print_verbose_info(self._downloader.write_debug) + format_sort = self.FormatSort(self, field_preference) formats.sort(key=lambda f: format_sort.calculate_preference(f)) def _check_formats(self, formats, video_id): @@ -1899,7 +2045,7 @@ class InfoExtractor(object): tbr = int_or_none(media_el.attrib.get('bitrate')) width = int_or_none(media_el.attrib.get('width')) height = int_or_none(media_el.attrib.get('height')) - format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])) + format_id = join_nonempty(f4m_id, tbr or i) # If is present, the specified f4m is a # stream-level manifest, and only set-level manifests may refer to # external resources. 
See section 11.4 and section 4 of F4M spec @@ -1961,7 +2107,7 @@ class InfoExtractor(object): def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None): return { - 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), + 'format_id': join_nonempty(m3u8_id, 'meta'), 'url': m3u8_url, 'ext': ext, 'protocol': 'm3u8', @@ -2008,16 +2154,16 @@ class InfoExtractor(object): headers=headers, query=query, video_id=video_id) def _parse_m3u8_formats_and_subtitles( - self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native', + self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native', preference=None, quality=None, m3u8_id=None, live=False, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, video_id=None): formats, subtitles = [], {} - if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access - return formats, subtitles - - has_drm = re.search(r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', m3u8_doc) + has_drm = re.search('|'.join([ + r'#EXT-X-FAXS-CM:', # Adobe Flash Access + r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay + ]), m3u8_doc) def format_url(url): return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url) @@ -2056,9 +2202,9 @@ class InfoExtractor(object): if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is formats = [{ - 'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))), + 'format_id': join_nonempty(m3u8_id, idx), 'format_index': idx, - 'url': m3u8_url, + 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'), 'ext': ext, 'protocol': entry_protocol, 'preference': preference, @@ -2105,7 +2251,7 @@ class InfoExtractor(object): if media_url: manifest_url = format_url(media_url) formats.extend({ - 'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))), + 'format_id': join_nonempty(m3u8_id, group_id, name, idx), 'format_note': name, 'format_index': idx, 'url': manifest_url, @@ -2162,9 +2308,9 @@ class InfoExtractor(object): # format_id intact. 
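# --- Illustrative aside (editor's sketch, not part of the patch) -------------
# The format_id hunks in this region replace '-'.join(filter(None, ...)) with
# utils.join_nonempty(), which also stringifies non-string parts and skips
# falsy ones. A quick sketch of the behaviour:
from hypervideo_dl.utils import join_nonempty

print(join_nonempty('hls', None, 1400))  # 'hls-1400'
print(join_nonempty('dash', '', 'audio', delim='_'))  # 'dash_audio'
# -----------------------------------------------------------------------------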
if not live: stream_name = build_stream_name() - format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats)) + format_id[1] = stream_name or '%d' % (tbr or len(formats)) f = { - 'format_id': '-'.join(map(str, filter(None, format_id))), + 'format_id': join_nonempty(*format_id), 'format_index': idx, 'url': manifest_url, 'manifest_url': m3u8_url, @@ -2264,7 +2410,7 @@ class InfoExtractor(object): if smil is False: assert not fatal - return [] + return [], {} namespace = self._parse_smil_namespace(smil) @@ -2628,7 +2774,7 @@ class InfoExtractor(object): mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) formats, subtitles = [], {} - stream_numbers = {'audio': 0, 'video': 0} + stream_numbers = collections.defaultdict(int) for period in mpd_doc.findall(_add_ns('Period')): period_duration = parse_duration(period.get('duration')) or mpd_duration period_ms_info = extract_multisegment_info(period, { @@ -2644,11 +2790,15 @@ class InfoExtractor(object): mime_type = representation_attrib['mimeType'] content_type = representation_attrib.get('contentType', mime_type.split('/')[0]) - codecs = representation_attrib.get('codecs', '') + codecs = parse_codecs(representation_attrib.get('codecs', '')) if content_type not in ('video', 'audio', 'text'): if mime_type == 'image/jpeg': content_type = mime_type - elif codecs.split('.')[0] == 'stpp': + elif codecs['vcodec'] != 'none': + content_type = 'video' + elif codecs['acodec'] != 'none': + content_type = 'audio' + elif codecs.get('tcodec', 'none') != 'none': content_type = 'text' elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'): content_type = 'text' @@ -2694,10 +2844,8 @@ class InfoExtractor(object): 'format_note': 'DASH %s' % content_type, 'filesize': filesize, 'container': mimetype2ext(mime_type) + '_dash', - 'manifest_stream_number': stream_numbers[content_type] + **codecs } - f.update(parse_codecs(codecs)) - stream_numbers[content_type] += 1 elif content_type == 'text': f = { 'ext': mimetype2ext(mime_type), @@ -2770,7 +2918,8 @@ class InfoExtractor(object): segment_duration = None if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info: segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) - representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) + representation_ms_info['total_number'] = int(math.ceil( + float_or_none(period_duration, segment_duration, default=0))) representation_ms_info['fragments'] = [{ media_location_key: media_template % { 'Number': segment_number, @@ -2861,10 +3010,16 @@ class InfoExtractor(object): f['url'] = initialization_url f['fragments'].append({location_key(initialization_url): initialization_url}) f['fragments'].extend(representation_ms_info['fragments']) + if not period_duration: + period_duration = try_get( + representation_ms_info, + lambda r: sum(frag['duration'] for frag in r['fragments']), float) else: # Assuming direct URL to unfragmented media. 
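# --- Illustrative aside (editor's sketch, not part of the patch) -------------
# The DASH hunk above derives the content type from parse_codecs() instead of
# string-splitting the codecs attribute. A sketch of the returned mapping for
# a typical (made-up) codec string:
from hypervideo_dl.utils import parse_codecs

print(parse_codecs('avc1.64001f,mp4a.40.2'))
# {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}
# so codecs['vcodec'] != 'none' classifies the representation as video.
# -----------------------------------------------------------------------------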
f['url'] = base_url - if content_type in ('video', 'audio') or mime_type == 'image/jpeg': + if content_type in ('video', 'audio', 'image/jpeg'): + f['manifest_stream_number'] = stream_numbers[f['url']] + stream_numbers[f['url']] += 1 formats.append(f) elif content_type == 'text': subtitles.setdefault(lang or 'und', []).append(f) @@ -2953,13 +3108,6 @@ class InfoExtractor(object): }) fragment_ctx['time'] += fragment_ctx['duration'] - format_id = [] - if ism_id: - format_id.append(ism_id) - if stream_name: - format_id.append(stream_name) - format_id.append(compat_str(tbr)) - if stream_type == 'text': subtitles.setdefault(stream_language, []).append({ 'ext': 'ismt', @@ -2978,7 +3126,7 @@ class InfoExtractor(object): }) elif stream_type in ('video', 'audio'): formats.append({ - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty(ism_id, stream_name, tbr), 'url': ism_url, 'manifest_url': ism_url, 'ext': 'ismv' if stream_type == 'video' else 'isma', @@ -3008,7 +3156,7 @@ class InfoExtractor(object): }) return formats, subtitles - def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None): + def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None): def absolute_url(item_url): return urljoin(base_url, item_url) @@ -3402,15 +3550,11 @@ class InfoExtractor(object): return formats def _live_title(self, name): - """ Generate the title for a live video """ - now = datetime.datetime.now() - now_str = now.strftime('%Y-%m-%d %H:%M') - return name + ' ' + now_str + self._downloader.deprecation_warning('hypervideo_dl.InfoExtractor._live_title is deprecated and does not work as expected') + return name def _int(self, v, name, fatal=False, **kwargs): res = int_or_none(v, **kwargs) - if 'get_attr' in kwargs: - print(getattr(v, kwargs['get_attr'])) if res is None: msg = 'Failed to extract %s: Could not parse value %r' % (name, v) if fatal: @@ -3515,14 +3659,18 @@ class InfoExtractor(object): def extractor(): comments = [] + interrupted = True try: while True: comments.append(next(generator)) - except KeyboardInterrupt: - interrupted = True - self.to_screen('Interrupted by user') except StopIteration: interrupted = False + except KeyboardInterrupt: + self.to_screen('Interrupted by user') + except Exception as e: + if self.get_param('ignoreerrors') is not True: + raise + self._downloader.report_error(e) comment_count = len(comments) self.to_screen(f'Extracted {comment_count} comments') return { @@ -3536,11 +3684,11 @@ class InfoExtractor(object): @staticmethod def _merge_subtitle_items(subtitle_list1, subtitle_list2): - """ Merge subtitle items for one language. Items with duplicated URLs + """ Merge subtitle items for one language. Items with duplicated URLs/data will be dropped. 
""" - list1_urls = set([item['url'] for item in subtitle_list1]) + list1_data = set((item.get('url'), item.get('data')) for item in subtitle_list1) ret = list(subtitle_list1) - ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls]) + ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data) return ret @classmethod @@ -3565,9 +3713,8 @@ class InfoExtractor(object): def mark_watched(self, *args, **kwargs): if not self.get_param('mark_watched', False): return - if (self._get_login_info()[0] is not None - or self.get_param('cookiefile') - or self.get_param('cookiesfrombrowser')): + if (self.supports_login() and self._get_login_info()[0] is not None + or self.get_param('cookiefile') or self.get_param('cookiesfrombrowser')): self._mark_watched(*args, **kwargs) def _mark_watched(self, *args, **kwargs): @@ -3600,7 +3747,7 @@ class InfoExtractor(object): else 'public' if all_known else None) - def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False): + def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False): ''' @returns A list of values for the extractor argument given by "key" or "default" if no such key is present @@ -3608,34 +3755,43 @@ class InfoExtractor(object): @param casesense When false, the values are converted to lower case ''' val = traverse_obj( - self._downloader.params, ('extractor_args', self.ie_key().lower(), key)) + self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key)) if val is None: return [] if default is NO_DEFAULT else default return list(val) if casesense else [x.lower() for x in val] + def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'): + if not playlist_id or not video_id: + return not video_id + + no_playlist = (smuggled_data or {}).get('force_noplaylist') + if no_playlist is not None: + return not no_playlist + + video_id = '' if video_id is True else f' {video_id}' + playlist_id = '' if playlist_id is True else f' {playlist_id}' + if self.get_param('noplaylist'): + self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist') + return False + self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}') + return True + class SearchInfoExtractor(InfoExtractor): """ Base class for paged search queries extractors. They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query} - Instances should define _SEARCH_KEY and _MAX_RESULTS. 
+ Instances should define _SEARCH_KEY and optionally _MAX_RESULTS """ + _MAX_RESULTS = float('inf') + @classmethod def _make_valid_url(cls): return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY - @classmethod - def suitable(cls, url): - return re.match(cls._make_valid_url(), url) is not None - def _real_extract(self, query): - mobj = re.match(self._make_valid_url(), query) - if mobj is None: - raise ExtractorError('Invalid search query "%s"' % query) - - prefix = mobj.group('prefix') - query = mobj.group('query') + prefix, query = self._match_valid_url(query).group('prefix', 'query') if prefix == '': return self._get_n_results(query, 1) elif prefix == 'all': diff --git a/hypervideo_dl/extractor/corus.py b/hypervideo_dl/extractor/corus.py index 352951e..1194613 100644 --- a/hypervideo_dl/extractor/corus.py +++ b/hypervideo_dl/extractor/corus.py @@ -55,7 +55,6 @@ class CorusIE(ThePlatformFeedIE): 'timestamp': 1486392197, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, 'expected_warnings': ['Failed to parse JSON'], diff --git a/hypervideo_dl/extractor/coub.py b/hypervideo_dl/extractor/coub.py index eba6b73..e90aa19 100644 --- a/hypervideo_dl/extractor/coub.py +++ b/hypervideo_dl/extractor/coub.py @@ -57,7 +57,7 @@ class CoubIE(InfoExtractor): file_versions = coub['file_versions'] - QUALITIES = ('low', 'med', 'high') + QUALITIES = ('low', 'med', 'high', 'higher') MOBILE = 'mobile' IPHONE = 'iphone' @@ -86,6 +86,7 @@ class CoubIE(InfoExtractor): 'format_id': '%s-%s-%s' % (HTML5, kind, quality), 'filesize': int_or_none(item.get('size')), 'vcodec': 'none' if kind == 'audio' else None, + 'acodec': 'none' if kind == 'video' else None, 'quality': quality_key(quality), 'source_preference': preference_key(HTML5), }) diff --git a/hypervideo_dl/extractor/cozytv.py b/hypervideo_dl/extractor/cozytv.py new file mode 100644 index 0000000..d49f1ca --- /dev/null +++ b/hypervideo_dl/extractor/cozytv.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import unified_strdate + + +class CozyTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cozy\.tv/(?P<uploader>[^/]+)/replays/(?P<date>[^/$#&?]+)' + + _TESTS = [{ + 'url': 'https://cozy.tv/beardson/replays/2021-11-19_1', + 'info_dict': { + 'id': 'beardson-2021-11-19_1', + 'ext': 'mp4', + 'title': 'pokemon pt2', + 'uploader': 'beardson', + 'upload_date': '20211119', + 'was_live': True, + 'duration': 7981, + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + uploader, date = self._match_valid_url(url).groups() + id = f'{uploader}-{date}' + data_json = self._download_json(f'https://api.cozy.tv/cache/{uploader}/replay/{date}', id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + f'https://cozycdn.foxtrotstream.xyz/replays/{uploader}/{date}/index.m3u8', id, ext='mp4') + return { + 'id': id, + 'title': data_json.get('title'), + 'uploader': data_json.get('user') or uploader, + 'upload_date': unified_strdate(data_json.get('date')), + 'was_live': True, + 'duration': data_json.get('duration'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/hypervideo_dl/extractor/cpac.py b/hypervideo_dl/extractor/cpac.py new file mode 100644 index 0000000..2274115 --- /dev/null +++ b/hypervideo_dl/extractor/cpac.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + str_or_none, + try_get, + 
unified_timestamp, + update_url_query, + urljoin, +) + +# compat_range +try: + if callable(xrange): + range = xrange +except (NameError, TypeError): + pass + + +class CPACIE(InfoExtractor): + IE_NAME = 'cpac' + _VALID_URL = r'https?://(?:www\.)?cpac\.ca/(?Pl-)?episode\?id=(?P[\da-f]{8}(?:-[\da-f]{4}){3}-[\da-f]{12})' + _TEST = { + # 'url': 'http://www.cpac.ca/en/programs/primetime-politics/episodes/65490909', + 'url': 'https://www.cpac.ca/episode?id=fc7edcae-4660-47e1-ba61-5b7f29a9db0f', + 'md5': 'e46ad699caafd7aa6024279f2614e8fa', + 'info_dict': { + 'id': 'fc7edcae-4660-47e1-ba61-5b7f29a9db0f', + 'ext': 'mp4', + 'upload_date': '20220215', + 'title': 'News Conference to Celebrate National Kindness Week – February 15, 2022', + 'description': 'md5:466a206abd21f3a6f776cdef290c23fb', + 'timestamp': 1644901200, + }, + 'params': { + 'format': 'bestvideo', + 'hls_prefer_native': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + url_lang = 'fr' if '/l-episode?' in url else 'en' + + content = self._download_json( + 'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/episode/index.xml&crafterSite=cpacca&id=' + video_id, + video_id) + video_url = try_get(content, lambda x: x['page']['details']['videoUrl'], compat_str) + formats = [] + if video_url: + content = content['page'] + title = str_or_none(content['details']['title_%s_t' % (url_lang, )]) + formats = self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', ext='mp4') + for fmt in formats: + # prefer language to match URL + fmt_lang = fmt.get('language') + if fmt_lang == url_lang: + fmt['language_preference'] = 10 + elif not fmt_lang: + fmt['language_preference'] = -1 + else: + fmt['language_preference'] = -10 + + self._sort_formats(formats) + + category = str_or_none(content['details']['category_%s_t' % (url_lang, )]) + + def is_live(v_type): + return (v_type == 'live') if v_type is not None else None + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': str_or_none(content['details'].get('description_%s_t' % (url_lang, ))), + 'timestamp': unified_timestamp(content['details'].get('liveDateTime')), + 'category': [category] if category else None, + 'thumbnail': urljoin(url, str_or_none(content['details'].get('image_%s_s' % (url_lang, )))), + 'is_live': is_live(content['details'].get('type')), + } + + +class CPACPlaylistIE(InfoExtractor): + IE_NAME = 'cpac:playlist' + _VALID_URL = r'(?i)https?://(?:www\.)?cpac\.ca/(?:program|search|(?Pemission|rechercher))\?(?:[^&]+&)*?(?P(?:id=\d+|programId=\d+|key=[^&]+))' + + _TESTS = [{ + 'url': 'https://www.cpac.ca/program?id=6', + 'info_dict': { + 'id': 'id=6', + 'title': 'Headline Politics', + 'description': 'Watch CPAC’s signature long-form coverage of the day’s pressing political events as they unfold.', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.cpac.ca/search?key=hudson&type=all&order=desc', + 'info_dict': { + 'id': 'key=hudson', + 'title': 'hudson', + }, + 'playlist_count': 22, + }, { + 'url': 'https://www.cpac.ca/search?programId=50', + 'info_dict': { + 'id': 'programId=50', + 'title': '50', + }, + 'playlist_count': 9, + }, { + 'url': 'https://www.cpac.ca/emission?id=6', + 'only_matching': True, + }, { + 'url': 'https://www.cpac.ca/rechercher?key=hudson&type=all&order=desc', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + url_lang = 'fr' if any(x in url for x in ('/emission?', '/rechercher?')) else 'en' + pl_type, list_type = ('program', 
'itemList') if any(x in url for x in ('/program?', '/emission?')) else ('search', 'searchResult') + api_url = ( + 'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/%s/index.xml&crafterSite=cpacca&%s' + % (pl_type, video_id, )) + content = self._download_json(api_url, video_id) + entries = [] + total_pages = int_or_none(try_get(content, lambda x: x['page'][list_type]['totalPages']), default=1) + for page in range(1, total_pages + 1): + if page > 1: + api_url = update_url_query(api_url, {'page': '%d' % (page, ), }) + content = self._download_json( + api_url, video_id, + note='Downloading continuation - %d' % (page, ), + fatal=False) + + for item in try_get(content, lambda x: x['page'][list_type]['item'], list) or []: + episode_url = urljoin(url, try_get(item, lambda x: x['url_%s_s' % (url_lang, )])) + if episode_url: + entries.append(episode_url) + + return self.playlist_result( + (self.url_result(entry) for entry in entries), + playlist_id=video_id, + playlist_title=try_get(content, lambda x: x['page']['program']['title_%s_t' % (url_lang, )]) or video_id.split('=')[-1], + playlist_description=try_get(content, lambda x: x['page']['program']['description_%s_t' % (url_lang, )]), + ) diff --git a/hypervideo_dl/extractor/crackle.py b/hypervideo_dl/extractor/crackle.py index 2c9d28d..db4962c 100644 --- a/hypervideo_dl/extractor/crackle.py +++ b/hypervideo_dl/extractor/crackle.py @@ -23,32 +23,35 @@ from ..utils import ( class CrackleIE(InfoExtractor): _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?(?:sony)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P\d+)' _TESTS = [{ - # geo restricted to CA - 'url': 'https://www.crackle.com/andromeda/2502343', + # Crackle is available in the United States and territories + 'url': 'https://www.crackle.com/thanksgiving/2510064', 'info_dict': { - 'id': '2502343', + 'id': '2510064', 'ext': 'mp4', - 'title': 'Under The Night', - 'description': 'md5:d2b8ca816579ae8a7bf28bfff8cefc8a', - 'duration': 2583, + 'title': 'Touch Football', + 'description': 'md5:cfbb513cf5de41e8b56d7ab756cff4df', + 'duration': 1398, 'view_count': int, 'average_rating': 0, - 'age_limit': 14, - 'genre': 'Action, Sci-Fi', - 'creator': 'Allan Kroeker', - 'artist': 'Keith Hamilton Cobb, Kevin Sorbo, Lisa Ryder, Lexa Doig, Robert Hewitt Wolfe', - 'release_year': 2000, - 'series': 'Andromeda', - 'episode': 'Under The Night', + 'age_limit': 17, + 'genre': 'Comedy', + 'creator': 'Daniel Powell', + 'artist': 'Chris Elliott, Amy Sedaris', + 'release_year': 2016, + 'series': 'Thanksgiving', + 'episode': 'Touch Football', 'season_number': 1, 'episode_number': 1, }, 'params': { # m3u8 download 'skip_download': True, - } + }, + 'expected_warnings': [ + 'Trying with a list of known countries' + ], }, { - 'url': 'https://www.sonycrackle.com/andromeda/2502343', + 'url': 'https://www.sonycrackle.com/thanksgiving/2510064', 'only_matching': True, }] @@ -129,7 +132,6 @@ class CrackleIE(InfoExtractor): break ignore_no_formats = self.get_param('ignore_no_formats_error') - allow_unplayable_formats = self.get_param('allow_unplayable_formats') if not media or (not media.get('MediaURLs') and not ignore_no_formats): raise ExtractorError( @@ -143,9 +145,9 @@ class CrackleIE(InfoExtractor): for e in media.get('MediaURLs') or []: if e.get('UseDRM'): has_drm = True - if not allow_unplayable_formats: - continue - format_url = url_or_none(e.get('Path')) + format_url = url_or_none(e.get('DRMPath')) + else: + format_url = url_or_none(e.get('Path')) if not format_url: continue ext = 
determine_ext(format_url) diff --git a/hypervideo_dl/extractor/craftsy.py b/hypervideo_dl/extractor/craftsy.py new file mode 100644 index 0000000..ed2f442 --- /dev/null +++ b/hypervideo_dl/extractor/craftsy.py @@ -0,0 +1,71 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor + +from ..utils import ( + dict_get, + get_element_by_id, + js_to_json, + traverse_obj, +) + + +class CraftsyIE(InfoExtractor): + _VALID_URL = r'https?://www.craftsy.com/class/(?P[a-z0-9_-]+)/' + _TESTS = [{ + 'url': 'https://www.craftsy.com/class/the-midnight-quilt-show-season-5/', + 'info_dict': { + 'id': 'the-midnight-quilt-show-season-5', + 'title': 'The Midnight Quilt Show Season 5', + 'description': 'md5:113eda818e985d1a566625fb2f833b7a', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.craftsy.com/class/sew-your-own-designer-handbag/', + 'info_dict': { + 'id': 'sew-your-own-designer-handbag', + 'title': 'Sew Your Own Designer Handbag', + 'description': 'md5:8270d0ef5427d3c895a27351aeaac276', + }, + 'playlist_mincount': 1, + }, { + 'url': 'https://www.craftsy.com/class/all-access-estes-park-wool-market/', + 'info_dict': { + 'id': 'all-access-estes-park-wool-market', + 'title': 'All Access: Estes Park Wool Market', + 'description': 'md5:aded1bd8d38ae2fae4dae936c0ae01e7', + }, + 'playlist_count': 6, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_data = self._parse_json(self._search_regex( + r'class_video_player_vars\s*=\s*({.*})\s*;', + get_element_by_id('vidstore-classes_class-video-player-js-extra', webpage), + 'video data'), video_id, transform_source=js_to_json) + + account_id = traverse_obj(video_data, ('video_player', 'bc_account_id')) + + entries = [] + class_preview = traverse_obj(video_data, ('video_player', 'class_preview')) + if class_preview: + v_id = class_preview.get('video_id') + entries.append(self.url_result( + f'http://players.brightcove.net/{account_id}/default_default/index.html?videoId={v_id}', + BrightcoveNewIE, v_id, class_preview.get('title'))) + + if dict_get(video_data, ('is_free', 'user_has_access')): + entries += [ + self.url_result( + f'http://players.brightcove.net/{account_id}/default_default/index.html?videoId={lesson["video_id"]}', + BrightcoveNewIE, lesson['video_id'], lesson.get('title')) + for lesson in video_data['lessons']] + + return self.playlist_result( + entries, video_id, video_data.get('class_title'), + self._html_search_meta(('og:description', 'description'), webpage, default=None)) diff --git a/hypervideo_dl/extractor/crowdbunker.py b/hypervideo_dl/extractor/crowdbunker.py new file mode 100644 index 0000000..72906af --- /dev/null +++ b/hypervideo_dl/extractor/crowdbunker.py @@ -0,0 +1,113 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + try_get, + unified_strdate, +) + + +class CrowdBunkerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?crowdbunker\.com/v/(?P[^/?#$&]+)' + + _TESTS = [{ + 'url': 'https://crowdbunker.com/v/0z4Kms8pi8I', + 'info_dict': { + 'id': '0z4Kms8pi8I', + 'ext': 'mp4', + 'title': '117) Pass vax et solutions', + 'description': 'md5:86bcb422c29475dbd2b5dcfa6ec3749c', + 'view_count': int, + 'duration': 5386, + 'uploader': 'Jérémie Mercier', + 'uploader_id': 'UCeN_qQV829NYf0pvPJhW5dQ', + 'like_count': int, + 'upload_date': '20211218', + 'thumbnail': 
'https://scw.divulg.org/cb-medias4/images/0z4Kms8pi8I/maxres.jpg' + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + data_json = self._download_json(f'https://api.divulg.org/post/{id}/details', + id, headers={'accept': 'application/json, text/plain, */*'}) + video_json = data_json['video'] + formats, subtitles = [], {} + for sub in video_json.get('captions') or []: + sub_url = try_get(sub, lambda x: x['file']['url']) + if not sub_url: + continue + subtitles.setdefault(sub.get('languageCode', 'fr'), []).append({ + 'url': sub_url, + }) + + mpd_url = try_get(video_json, lambda x: x['dashManifest']['url']) + if mpd_url: + fmts, subs = self._extract_mpd_formats_and_subtitles(mpd_url, id) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + m3u8_url = try_get(video_json, lambda x: x['hlsManifest']['url']) + if m3u8_url: + fmts, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, id) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + + thumbnails = [{ + 'url': image['url'], + 'height': int_or_none(image.get('height')), + 'width': int_or_none(image.get('width')), + } for image in video_json.get('thumbnails') or [] if image.get('url')] + + self._sort_formats(formats) + return { + 'id': id, + 'title': video_json.get('title'), + 'description': video_json.get('description'), + 'view_count': video_json.get('viewCount'), + 'duration': video_json.get('duration'), + 'uploader': try_get(data_json, lambda x: x['channel']['name']), + 'uploader_id': try_get(data_json, lambda x: x['channel']['id']), + 'like_count': data_json.get('likesCount'), + 'upload_date': unified_strdate(video_json.get('publishedAt') or video_json.get('createdAt')), + 'thumbnails': thumbnails, + 'formats': formats, + 'subtitles': subtitles, + } + + +class CrowdBunkerChannelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?crowdbunker\.com/@(?P<id>[^/?#$&]+)' + + _TESTS = [{ + 'url': 'https://crowdbunker.com/@Milan_UHRIN', + 'playlist_mincount': 14, + 'info_dict': { + 'id': 'Milan_UHRIN', + }, + }] + + def _entries(self, id): + last = None + + for page in itertools.count(): + channel_json = self._download_json( + f'https://api.divulg.org/organization/{id}/posts', id, headers={'accept': 'application/json, text/plain, */*'}, + query={'after': last} if last else {}, note=f'Downloading Page {page}') + for item in channel_json.get('items') or []: + v_id = item.get('uid') + if not v_id: + continue + yield self.url_result( + 'https://crowdbunker.com/v/%s' % v_id, ie=CrowdBunkerIE.ie_key(), video_id=v_id) + last = channel_json.get('last') + if not last: + break + + def _real_extract(self, url): + id = self._match_id(url) + return self.playlist_result(self._entries(id), playlist_id=id) diff --git a/hypervideo_dl/extractor/crunchyroll.py b/hypervideo_dl/extractor/crunchyroll.py index 511ac1b..7edb645 100644 --- a/hypervideo_dl/extractor/crunchyroll.py +++ b/hypervideo_dl/extractor/crunchyroll.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 import re import json import zlib @@ -8,7 +9,7 @@ import zlib from hashlib import sha1 from math import pow, sqrt, floor from .common import InfoExtractor -from .vrv import VRVIE +from .vrv import VRVBaseIE from ..compat import ( compat_b64decode, compat_etree_Element, @@ -23,14 +24,17 @@ from ..utils import ( bytes_to_intlist, extract_attributes, float_or_none, + format_field, intlist_to_bytes, int_or_none, + join_nonempty, lowercase_escape, merge_dicts, +
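# [Editor's sketch] CrowdBunkerIE above collects formats from both a DASH and an HLS
# manifest and folds the per-language subtitle lists together with _merge_subtitles.
# The merge pattern, stripped of the extractor plumbing (illustrative names; each
# subtitle dict maps language code -> list of track dicts):

def merge_subtitles(*dicts):
    merged = {}
    for d in dicts:
        for lang, tracks in (d or {}).items():
            merged.setdefault(lang, []).extend(tracks)
    return merged

# merge_subtitles({'fr': [{'url': 'a.vtt'}]}, {'fr': [{'url': 'b.vtt'}]})
# -> {'fr': [{'url': 'a.vtt'}, {'url': 'b.vtt'}]}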
qualities, remove_end, sanitized_Request, + traverse_obj, try_get, - urlencode_postdata, xpath_text, ) from ..aes import ( @@ -39,8 +43,8 @@ from ..aes import ( class CrunchyrollBaseIE(InfoExtractor): - _LOGIN_URL = 'https://www.crunchyroll.com/login' - _LOGIN_FORM = 'login_form' + _LOGIN_URL = 'https://www.crunchyroll.com/welcome/login' + _API_BASE = 'https://api.crunchyroll.com' _NETRC_MACHINE = 'crunchyroll' def _call_rpc_api(self, method, video_id, note=None, data=None): @@ -53,57 +57,50 @@ class CrunchyrollBaseIE(InfoExtractor): 'Content-Type': 'application/x-www-form-urlencoded', }) - def _login(self): - username, password = self._get_login_info() - if username is None: + def _perform_login(self, username, password): + if self._get_cookies(self._LOGIN_URL).get('etp_rt'): return - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - - def is_logged(webpage): - return 'href="/logout"' in webpage - - # Already logged in - if is_logged(login_page): - return - - login_form_str = self._search_regex( - r'(?P
<form><form[^>]+?id=(["\'])%s\2[^>]*>)' % self._LOGIN_FORM, - login_page, 'login form', group='form') - - post_url = extract_attributes(login_form_str).get('action') - if not post_url: - post_url = self._LOGIN_URL - elif not post_url.startswith('http'): - post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) - - login_form = self._form_hidden_inputs(self._LOGIN_FORM, login_page) - - login_form.update({ - 'login_form[name]': username, - 'login_form[password]': password, - }) - - response = self._download_webpage( - post_url, None, 'Logging in', 'Wrong login info', - data=urlencode_postdata(login_form), - headers={'Content-Type': 'application/x-www-form-urlencoded'}) - - # Successful login - if is_logged(response): - return - - error = self._html_search_regex( - '(?s)<ul[^>]+class=["\']messages["\'][^>]*>(.+?)</ul>', - response, 'error message', default=None) - if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) - - raise ExtractorError('Unable to log in') - - def _real_initialize(self): - self._login() + upsell_response = self._download_json( + f'{self._API_BASE}/get_upsell_data.0.json', None, 'Getting session id', + query={ + 'sess_id': 1, + 'device_id': 'whatvalueshouldbeforweb', + 'device_type': 'com.crunchyroll.static', + 'access_token': 'giKq5eY27ny3cqz', + 'referer': self._LOGIN_URL + }) + if upsell_response['code'] != 'ok': + raise ExtractorError('Could not get session id') + session_id = upsell_response['data']['session_id'] + + login_response = self._download_json( + f'{self._API_BASE}/login.1.json', None, 'Logging in', + data=compat_urllib_parse_urlencode({ + 'account': username, + 'password': password, + 'session_id': session_id + }).encode('ascii')) + if login_response['code'] != 'ok': + raise ExtractorError('Login failed. Server message: %s' % login_response['message'], expected=True) + if not self._get_cookies(self._LOGIN_URL).get('etp_rt'): + raise ExtractorError('Login succeeded but did not set etp_rt cookie') + + # Beta-specific, but needed for redirects + def _get_beta_embedded_json(self, webpage, display_id): + initial_state = self._parse_json(self._search_regex( + r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), display_id) + app_config = self._parse_json(self._search_regex( + r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), display_id) + return initial_state, app_config + + def _redirect_to_beta(self, webpage, iekey, video_id): + if not self._get_cookies(self._LOGIN_URL).get('etp_rt'): + raise ExtractorError('Received a beta page from non-beta url when not logged in.') + initial_state, app_config = self._get_beta_embedded_json(webpage, video_id) + url = app_config['baseSiteUrl'] + initial_state['router']['locations']['current']['pathname'] + self.to_screen(f'{video_id}: Redirected to beta site - {url}') + return self.url_result(f'{url}', iekey, video_id) @staticmethod def _add_skip_wall(url): @@ -119,7 +116,7 @@ class CrunchyrollBaseIE(InfoExtractor): parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) -class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): +class CrunchyrollIE(CrunchyrollBaseIE, VRVBaseIE): IE_NAME = 'crunchyroll' _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P<id>[0-9]+))(?:[/?&]|$)' _TESTS = [{ @@ -425,6 +422,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text webpage = self._download_webpage( self._add_skip_wall(webpage_url), video_id, headers=self.geo_verification_headers())
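# [Editor's sketch] The `_get_beta_embedded_json` helper added above boils down to
# pulling two JSON blobs that the beta (React) pages assign to global variables.
# Outside the extractor framework the same idea is a regex plus json.loads; the helper
# name is illustrative, and it assumes the blob is plain JSON, as it is on these pages:

import json
import re

def extract_embedded_json(webpage, var_name):
    # matches e.g.  window.__INITIAL_STATE__ = {...};  and decodes the object literal
    mobj = re.search(r'%s\s*=\s*({.+?})\s*;' % re.escape(var_name), webpage, flags=re.DOTALL)
    return json.loads(mobj.group(1)) if mobj else None

# usage: extract_embedded_json(html, '__INITIAL_STATE__') -> dict (or None if absent)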
+ if re.search(r'<div id="preload-data">', webpage): + return self._redirect_to_beta(webpage, CrunchyrollBetaIE.ie_key(), video_id) note_m = self._html_search_regex( r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, 'trailer-notice', default='') @@ -478,19 +477,24 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>
    '], webpage, 'video_uploader', default=False) + requested_languages = self._configuration_arg('language') + requested_hardsubs = [('' if val == 'none' else val) for val in self._configuration_arg('hardsub')] + language_preference = qualities((requested_languages or [language or ''])[::-1]) + hardsub_preference = qualities((requested_hardsubs or ['', language or ''])[::-1]) + formats = [] for stream in media.get('streams', []): - audio_lang = stream.get('audio_lang') - hardsub_lang = stream.get('hardsub_lang') + audio_lang = stream.get('audio_lang') or '' + hardsub_lang = stream.get('hardsub_lang') or '' + if (requested_languages and audio_lang.lower() not in requested_languages + or requested_hardsubs and hardsub_lang.lower() not in requested_hardsubs): + continue vrv_formats = self._extract_vrv_formats( stream.get('url'), video_id, stream.get('format'), audio_lang, hardsub_lang) for f in vrv_formats: - f['language_preference'] = 1 if audio_lang == language else 0 - f['quality'] = ( - 1 if not hardsub_lang - else 0 if hardsub_lang == language - else -1) + f['language_preference'] = language_preference(audio_lang) + f['quality'] = hardsub_preference(hardsub_lang) formats.extend(vrv_formats) if not formats: available_fmts = [] @@ -684,6 +688,8 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): # https:// gives a 403, but http:// does not self._add_skip_wall(url).replace('https://', 'http://'), show_id, headers=self.geo_verification_headers()) + if re.search(r'
<div id="preload-data">', webpage): + return self._redirect_to_beta(webpage, CrunchyrollBetaShowIE.ie_key(), show_id) title = self._html_search_meta('name', webpage, default=None) episode_re = r'<li id="showview_videos_media_(\d+)"[^>]+>.*?<a href="([^"]+)"'
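# [Editor's sketch] The format ranking above (and in the beta extractor below) leans on
# yt-dlp's `qualities` helper: given a list ordered worst-to-best, it returns a scoring
# function, so `language_preference` and `quality` become simple index lookups. A minimal
# standalone equivalent (the name `make_preference` is illustrative):

def make_preference(ordered):
    # `ordered` is worst-first; values not in the list rank below everything else
    def pref(value):
        try:
            return ordered.index(value)
        except ValueError:
            return -1
    return pref

# usage: pref = make_preference(['', 'ja'])  ->  pref('ja') == 1, pref('') == 0, pref('en') == -1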
+class CrunchyrollBetaIE(CrunchyrollBetaBaseIE): IE_NAME = 'crunchyroll:beta' - _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)watch/(?P<internal_id>\w+)/(?P<id>[\w\-]+)/?(?:\?|$)' + _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)watch/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)' _TESTS = [{ 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', 'info_dict': { @@ -719,26 +772,129 @@ 'uploader': 'Toei Animation', 'title': 'World Trigger Episode 73 – To the Future', 'upload_date': '20160402', + 'episode_number': 73, + 'series': 'World Trigger', + 'average_rating': 4.9, + 'episode': 'To the Future', + 'season': 'World Trigger', + 'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/c870dedca1a83137c2d3d144984155ed1459527119_main.jpg', + 'season_number': 1, + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Unable to download XML'] + }, { + 'url': 'https://beta.crunchyroll.com/watch/GYK53DMPR/wicked-lord-shingan-reborn', + 'info_dict': { + 'id': '648781', + 'ext': 'mp4', + 'episode_number': 1, + 'timestamp': 1389173400, + 'series': 'Love, Chunibyo & Other Delusions - Heart Throb -', + 'description': 'md5:5579d1a0355cc618558ba23d27067a62', + 'uploader': 'TBS', + 'episode': 'Wicked Lord Shingan... Reborn', + 'average_rating': 4.9, + 'season': 'Love, Chunibyo & Other Delusions - Heart Throb -', + 'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/2ba0384e225a5370d5f0ee9496d91ea51389046521_main.jpg', + 'title': 'Love, Chunibyo & Other Delusions - Heart Throb - Episode 1 – Wicked Lord Shingan... Reborn', + 'season_number': 2, + 'upload_date': '20140108', }, 'params': {'skip_download': 'm3u8'}, 'expected_warnings': ['Unable to download XML'] + }, { + 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/', + 'only_matching': True, + }] def _real_extract(self, url): - lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'internal_id', 'id') - webpage = self._download_webpage(url, display_id) - episode_data = self._parse_json( - self._search_regex(r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'episode data'), - display_id)['content']['byId'][internal_id] - video_id = episode_data['external_id'].split('.')[1] - series_id = episode_data['episode_metadata']['series_slug_title'] - return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id}/{display_id}-{video_id}', - CrunchyrollIE.ie_key(), video_id) - - -class CrunchyrollBetaShowIE(CrunchyrollBaseIE): + lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') + + if not self._get_cookies(url).get('etp_rt'): + return self._redirect_from_beta(url, lang, internal_id, display_id, True, CrunchyrollIE.ie_key()) + + api_domain, bucket, params = self._get_params(lang) + + episode_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id, + note='Retrieving episode metadata', + query=params) + if episode_response.get('is_premium_only') and not episode_response.get('playback'): + raise ExtractorError('This video is for premium members only.', expected=True) + stream_response = self._download_json( + episode_response['playback'], display_id, + note='Retrieving stream info') + + thumbnails = [] + for thumbnails_data in traverse_obj(episode_response, ('images', 'thumbnail')): + for thumbnail_data in thumbnails_data: + thumbnails.append({ + 'url': thumbnail_data.get('source'), + 'width': thumbnail_data.get('width'), + 'height': thumbnail_data.get('height'), + }) + subtitles = {} + for lang, subtitle_data in (stream_response.get('subtitles') or {}).items(): + subtitles[lang] = [{ + 'url':
subtitle_data.get('url'), + 'ext': subtitle_data.get('format') + }] + + requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])] + hardsub_preference = qualities(requested_hardsubs[::-1]) + requested_formats = self._configuration_arg('format') or ['adaptive_hls'] + + formats = [] + for stream_type, streams in stream_response.get('streams', {}).items(): + if stream_type not in requested_formats: + continue + for stream in streams.values(): + hardsub_lang = stream.get('hardsub_locale') or '' + if hardsub_lang.lower() not in requested_hardsubs: + continue + format_id = join_nonempty( + stream_type, + format_field(stream, 'hardsub_locale', 'hardsub-%s')) + if not stream.get('url'): + continue + if stream_type.split('_')[-1] == 'hls': + adaptive_formats = self._extract_m3u8_formats( + stream['url'], display_id, 'mp4', m3u8_id=format_id, + note='Downloading %s information' % format_id, + fatal=False) + elif stream_type.split('_')[-1] == 'dash': + adaptive_formats = self._extract_mpd_formats( + stream['url'], display_id, mpd_id=format_id, + note='Downloading %s information' % format_id, + fatal=False) + for f in adaptive_formats: + if f.get('acodec') != 'none': + f['language'] = stream_response.get('audio_locale') + f['quality'] = hardsub_preference(hardsub_lang.lower()) + formats.extend(adaptive_formats) + self._sort_formats(formats) + + return { + 'id': internal_id, + 'title': '%s Episode %s – %s' % (episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')), + 'description': episode_response.get('description').replace(r'\r\n', '\n'), + 'duration': float_or_none(episode_response.get('duration_ms'), 1000), + 'thumbnails': thumbnails, + 'series': episode_response.get('series_title'), + 'series_id': episode_response.get('series_id'), + 'season': episode_response.get('season_title'), + 'season_id': episode_response.get('season_id'), + 'season_number': episode_response.get('season_number'), + 'episode': episode_response.get('title'), + 'episode_number': episode_response.get('sequence_number'), + 'subtitles': subtitles, + 'formats': formats + } + + +class CrunchyrollBetaShowIE(CrunchyrollBetaBaseIE): IE_NAME = 'crunchyroll:playlist:beta' - _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/\w+/(?P<id>[\w\-]+)/?(?:\?|$)' + _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)' _TESTS = [{ 'url': 'https://beta.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA', 'info_dict': { @@ -746,12 +902,57 @@ class CrunchyrollBetaShowIE(CrunchyrollBaseIE): 'title': 'Girl Friend BETA', }, 'playlist_mincount': 10, + }, { + 'url': 'https://beta.crunchyroll.com/series/GYJQV73V6/love-chunibyo--other-delusions---heart-throb--', + 'info_dict': { + 'id': 'love-chunibyo-other-delusions-heart-throb-', + 'title': 'Love, Chunibyo & Other Delusions - Heart Throb -', + }, + 'playlist_mincount': 10, }, { 'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR/Girl-Friend-BETA', 'only_matching': True, }] def _real_extract(self, url): - lang, series_id = self._match_valid_url(url).group('lang', 'id') - return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id.lower()}', - CrunchyrollShowPlaylistIE.ie_key(), series_id) + lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') + + if not self._get_cookies(url).get('etp_rt'): + return self._redirect_from_beta(url, lang, internal_id, display_id, False,
CrunchyrollShowPlaylistIE.ie_key()) + + api_domain, bucket, params = self._get_params(lang) + + series_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/series/{internal_id}', display_id, + note='Retrieving series metadata', query=params) + + seasons_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/seasons?series_id={internal_id}', display_id, + note='Retrieving season list', query=params) + + def entries(): + for season in seasons_response['items']: + episodes_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/episodes?season_id={season["id"]}', display_id, + note=f'Retrieving episode list for {season.get("slug_title")}', query=params) + for episode in episodes_response['items']: + episode_id = episode['id'] + episode_display_id = episode['slug_title'] + yield { + '_type': 'url', + 'url': f'https://beta.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}', + 'ie_key': CrunchyrollBetaIE.ie_key(), + 'id': episode_id, + 'title': '%s Episode %s – %s' % (episode.get('season_title'), episode.get('episode'), episode.get('title')), + 'description': try_get(episode, lambda x: x['description'].replace(r'\r\n', '\n')), + 'duration': float_or_none(episode.get('duration_ms'), 1000), + 'series': episode.get('series_title'), + 'series_id': episode.get('series_id'), + 'season': episode.get('season_title'), + 'season_id': episode.get('season_id'), + 'season_number': episode.get('season_number'), + 'episode': episode.get('title'), + 'episode_number': episode.get('sequence_number') + } + + return self.playlist_result(entries(), internal_id, series_response.get('title')) diff --git a/hypervideo_dl/extractor/cspan.py b/hypervideo_dl/extractor/cspan.py index 2e01aff..f51159b 100644 --- a/hypervideo_dl/extractor/cspan.py +++ b/hypervideo_dl/extractor/cspan.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_HTMLParseError from ..utils import ( determine_ext, ExtractorError, @@ -11,14 +12,16 @@ from ..utils import ( get_element_by_attribute, get_element_by_class, int_or_none, + join_nonempty, js_to_json, merge_dicts, parse_iso8601, + parse_qs, smuggle_url, str_to_int, unescapeHTML, ) -from .senateisvp import SenateISVPIE +from .senategov import SenateISVPIE from .ustream import UstreamIE @@ -126,8 +129,12 @@ class CSpanIE(InfoExtractor): ext = 'vtt' subtitle['ext'] = ext ld_info = self._search_json_ld(webpage, video_id, default={}) - title = get_element_by_class('video-page-title', webpage) or \ - self._og_search_title(webpage) + try: + title = get_element_by_class('video-page-title', webpage) + except compat_HTMLParseError: + title = None + if title is None: + title = self._og_search_title(webpage) description = get_element_by_attribute('itemprop', 'description', webpage) or \ self._html_search_meta(['og:description', 'description'], webpage) return merge_dicts(info, ld_info, { @@ -242,3 +249,42 @@ class CSpanIE(InfoExtractor): 'title': title, 'id': 'c' + video_id if video_type == 'clip' else video_id, } + + +class CSpanCongressIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?c-span\.org/congress/' + _TESTS = [{ + 'url': 'https://www.c-span.org/congress/?chamber=house&date=2017-12-13&t=1513208380', + 'info_dict': { + 'id': 'house_2017-12-13', + 'title': 'Congressional Chronicle - Members of Congress, Hearings and More', + 'description': 'md5:54c264b7a8f219937987610243305a84', + 'thumbnail': r're:https://ximage.c-spanvideo.org/.+', + 'ext': 'mp4' + } + }] + + def 
_real_extract(self, url): + query = parse_qs(url) + video_date = query.get('date', [None])[0] + video_id = join_nonempty(query.get('chamber', ['senate'])[0], video_date, delim='_') + webpage = self._download_webpage(url, video_id) + if not video_date: + jwp_date = re.search(r'jwsetup.clipprogdate = \'(?P\d{4}-\d{2}-\d{2})\';', webpage) + if jwp_date: + video_id = f'{video_id}_{jwp_date.group("date")}' + jwplayer_data = self._parse_json( + self._search_regex(r'jwsetup\s*=\s*({(?:.|\n)[^;]+});', webpage, 'player config'), + video_id, transform_source=js_to_json) + + title = (self._og_search_title(webpage, default=None) + or self._html_extract_title(webpage, 'video title')) + description = (self._og_search_description(webpage, default=None) + or self._html_search_meta('description', webpage, 'description', default=None)) + + return { + **self._parse_jwplayer_data(jwplayer_data, video_id, False), + 'title': re.sub(r'\s+', ' ', title.split('|')[0]).strip(), + 'description': description, + 'http_headers': {'Referer': 'https://www.c-span.org/'}, + } diff --git a/hypervideo_dl/extractor/ctvnews.py b/hypervideo_dl/extractor/ctvnews.py index 03f8cef..952f4c7 100644 --- a/hypervideo_dl/extractor/ctvnews.py +++ b/hypervideo_dl/extractor/ctvnews.py @@ -65,4 +65,9 @@ class CTVNewsIE(InfoExtractor): }) entries = [ninecninemedia_url_result(clip_id) for clip_id in orderedSet( re.findall(r'clip\.id\s*=\s*(\d+);', webpage))] + if not entries: + webpage = self._download_webpage(url, page_id) + if 'getAuthStates("' in webpage: + entries = [ninecninemedia_url_result(clip_id) for clip_id in + self._search_regex(r'getAuthStates\("([\d+,]+)"', webpage, 'clip ids').split(',')] return self.playlist_result(entries, page_id) diff --git a/hypervideo_dl/extractor/curiositystream.py b/hypervideo_dl/extractor/curiositystream.py index 034a5c9..b8abcf7 100644 --- a/hypervideo_dl/extractor/curiositystream.py +++ b/hypervideo_dl/extractor/curiositystream.py @@ -15,7 +15,6 @@ from ..utils import ( class CuriosityStreamBaseIE(InfoExtractor): _NETRC_MACHINE = 'curiositystream' _auth_token = None - _API_BASE_URL = 'https://api.curiositystream.com/v1/' def _handle_errors(self, result): error = result.get('error', {}).get('message') @@ -34,43 +33,46 @@ class CuriosityStreamBaseIE(InfoExtractor): self._handle_errors(result) return result['data'] - def _real_initialize(self): - email, password = self._get_login_info() - if email is None: - return + def _perform_login(self, username, password): result = self._download_json( - self._API_BASE_URL + 'login', None, data=urlencode_postdata({ - 'email': email, + 'https://api.curiositystream.com/v1/login', None, + note='Logging in', data=urlencode_postdata({ + 'email': username, 'password': password, })) self._handle_errors(result) - self._auth_token = result['message']['auth_token'] + CuriosityStreamBaseIE._auth_token = result['message']['auth_token'] class CuriosityStreamIE(CuriosityStreamBaseIE): IE_NAME = 'curiositystream' _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'https://app.curiositystream.com/video/2', 'info_dict': { 'id': '2', 'ext': 'mp4', 'title': 'How Did You Develop The Internet?', 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', + 'channel': 'Curiosity Stream', + 'categories': ['Technology', 'Interview'], + 'average_rating': 96.79, + 'series_id': '2', }, 'params': { - 'format': 'bestvideo', # m3u8 download 'skip_download': True, }, - } + }] + + 
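# [Editor's note] Note the login assignment above: the auth token is stored on the
# *class* (CuriosityStreamBaseIE._auth_token), not the instance, so every extractor
# instantiated in the same run reuses one login. The same caching pattern in miniature
# (illustrative class, not part of the patch):

class ApiClient:
    _auth_token = None  # shared across all instances

    def login(self, token):
        ApiClient._auth_token = token  # class attribute, deliberately not self._auth_token

# a = ApiClient(); a.login('t0k3n'); assert ApiClient._auth_token == 't0k3n'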
_API_BASE_URL = 'https://api.curiositystream.com/v1/media/' def _real_extract(self, url): video_id = self._match_id(url) formats = [] for encoding_format in ('m3u8', 'mpd'): - media = self._call_api('media/' + video_id, video_id, query={ + media = self._call_api(video_id, video_id, query={ 'encodingsNew': 'true', 'encodingsFormat': encoding_format, }) @@ -140,12 +142,33 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): 'duration': int_or_none(media.get('duration')), 'tags': media.get('tags'), 'subtitles': subtitles, + 'channel': media.get('producer'), + 'categories': [media.get('primary_category'), media.get('type')], + 'average_rating': media.get('rating_percentage'), + 'series_id': str(media.get('collection_id') or '') or None, } -class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): - IE_NAME = 'curiositystream:collection' - _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collections?|series)/(?P\d+)' +class CuriosityStreamCollectionBaseIE(CuriosityStreamBaseIE): + + def _real_extract(self, url): + collection_id = self._match_id(url) + collection = self._call_api(collection_id, collection_id) + entries = [] + for media in collection.get('media', []): + media_id = compat_str(media.get('id')) + media_type, ie = ('series', CuriosityStreamSeriesIE) if media.get('is_collection') else ('video', CuriosityStreamIE) + entries.append(self.url_result( + 'https://curiositystream.com/%s/%s' % (media_type, media_id), + ie=ie.ie_key(), video_id=media_id)) + return self.playlist_result( + entries, collection_id, + collection.get('title'), collection.get('description')) + + +class CuriosityStreamCollectionsIE(CuriosityStreamCollectionBaseIE): + IE_NAME = 'curiositystream:collections' + _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/collections/(?P\d+)' _API_BASE_URL = 'https://api.curiositystream.com/v2/collections/' _TESTS = [{ 'url': 'https://curiositystream.com/collections/86', @@ -156,7 +179,17 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): }, 'playlist_mincount': 7, }, { - 'url': 'https://app.curiositystream.com/collection/2', + 'url': 'https://curiositystream.com/collections/36', + 'only_matching': True, + }] + + +class CuriosityStreamSeriesIE(CuriosityStreamCollectionBaseIE): + IE_NAME = 'curiositystream:series' + _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:series|collection)/(?P\d+)' + _API_BASE_URL = 'https://api.curiositystream.com/v2/series/' + _TESTS = [{ + 'url': 'https://curiositystream.com/series/2', 'info_dict': { 'id': '2', 'title': 'Curious Minds: The Internet', @@ -164,23 +197,6 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): }, 'playlist_mincount': 16, }, { - 'url': 'https://curiositystream.com/series/2', - 'only_matching': True, - }, { - 'url': 'https://curiositystream.com/collections/36', + 'url': 'https://curiositystream.com/collection/2', 'only_matching': True, }] - - def _real_extract(self, url): - collection_id = self._match_id(url) - collection = self._call_api(collection_id, collection_id) - entries = [] - for media in collection.get('media', []): - media_id = compat_str(media.get('id')) - media_type, ie = ('series', CuriosityStreamCollectionIE) if media.get('is_collection') else ('video', CuriosityStreamIE) - entries.append(self.url_result( - 'https://curiositystream.com/%s/%s' % (media_type, media_id), - ie=ie.ie_key(), video_id=media_id)) - return self.playlist_result( - entries, collection_id, - collection.get('title'), collection.get('description')) diff --git a/hypervideo_dl/extractor/cybrary.py 
b/hypervideo_dl/extractor/cybrary.py new file mode 100644 index 0000000..c278f0f --- /dev/null +++ b/hypervideo_dl/extractor/cybrary.py @@ -0,0 +1,146 @@ +# coding: utf-8 +from .common import InfoExtractor + +from ..utils import ( + ExtractorError, + smuggle_url, + str_or_none, + traverse_obj, + urlencode_postdata +) + + +class CybraryBaseIE(InfoExtractor): + _API_KEY = 'AIzaSyCX9ru6j70PX2My1Eq6Q1zoMAhuTdXlzSw' + _ENDPOINTS = { + 'course': 'https://app.cybrary.it/courses/api/catalog/browse/course/{}', + 'course_enrollment': 'https://app.cybrary.it/courses/api/catalog/{}/enrollment', + 'enrollment': 'https://app.cybrary.it/courses/api/enrollment/{}', + 'launch': 'https://app.cybrary.it/courses/api/catalog/{}/launch', + 'vimeo_oembed': 'https://vimeo.com/api/oembed.json?url=https://vimeo.com/{}', + } + _NETRC_MACHINE = 'cybrary' + _TOKEN = None + + def _perform_login(self, username, password): + CybraryBaseIE._TOKEN = self._download_json( + f'https://identitytoolkit.googleapis.com/v1/accounts:signInWithPassword?key={self._API_KEY}', + None, data=urlencode_postdata({'email': username, 'password': password, 'returnSecureToken': True}), + note='Logging in')['idToken'] + + def _real_initialize(self): + if not self._TOKEN: + self.raise_login_required(method='password') + + def _call_api(self, endpoint, item_id): + return self._download_json( + self._ENDPOINTS[endpoint].format(item_id), item_id, + note=f'Downloading {endpoint} JSON metadata', + headers={'Authorization': f'Bearer {self._TOKEN}'}) + + def _get_vimeo_id(self, activity_id): + launch_api = self._call_api('launch', activity_id) + + if launch_api.get('url'): + return self._search_regex(r'https?://player\.vimeo\.com/video/(?P[0-9]+)', launch_api['url'], 'vimeo_id') + return traverse_obj(launch_api, ('vendor_data', 'content', ..., 'videoId'), get_all=False) + + +class CybraryIE(CybraryBaseIE): + _VALID_URL = r'https?://app.cybrary.it/immersive/(?P[0-9]+)/activity/(?P[0-9]+)' + _TESTS = [{ + 'url': 'https://app.cybrary.it/immersive/12487950/activity/63102', + 'md5': '9ae12d37e555cb2ed554223a71a701d0', + 'info_dict': { + 'id': '646609770', + 'ext': 'mp4', + 'title': 'Getting Started', + 'thumbnail': 'https://i.vimeocdn.com/video/1301817996-76a268f0c56cff18a5cecbbdc44131eb9dda0c80eb0b3a036_1280', + 'series_id': '63111', + 'uploader_url': 'https://vimeo.com/user30867300', + 'duration': 88, + 'uploader_id': 'user30867300', + 'series': 'Cybrary Orientation', + 'uploader': 'Cybrary', + 'chapter': 'Cybrary Orientation Series', + 'chapter_id': '63110' + }, + 'expected_warnings': ['No authenticators for vimeo'] + }, { + 'url': 'https://app.cybrary.it/immersive/12747143/activity/52686', + 'md5': '62f26547dccc59c44363e2a13d4ad08d', + 'info_dict': { + 'id': '445638073', + 'ext': 'mp4', + 'title': 'Azure Virtual Network IP Addressing', + 'thumbnail': 'https://i.vimeocdn.com/video/936667051-1647ace66c627d4a2382185e0dae8deb830309bfddd53f8b2367b2f91e92ed0e-d_1280', + 'series_id': '52733', + 'uploader_url': 'https://vimeo.com/user30867300', + 'duration': 426, + 'uploader_id': 'user30867300', + 'series': 'AZ-500: Microsoft Azure Security Technologies', + 'uploader': 'Cybrary', + 'chapter': 'Implement Network Security', + 'chapter_id': '52693' + }, + 'expected_warnings': ['No authenticators for vimeo'] + }] + + def _real_extract(self, url): + activity_id, enrollment_id = self._match_valid_url(url).group('id', 'enrollment') + course = self._call_api('enrollment', enrollment_id)['content'] + activity = traverse_obj(course, ('learning_modules', ..., 
'activities', lambda _, v: int(activity_id) == v['id']), get_all=False) + + if activity.get('type') not in ['Video Activity', 'Lesson Activity']: + raise ExtractorError('The activity is not a video', expected=True) + + module = next((m for m in course.get('learning_modules') or [] + if int(activity_id) in traverse_obj(m, ('activities', ..., 'id') or [])), None) + + vimeo_id = self._get_vimeo_id(activity_id) + + return { + '_type': 'url_transparent', + 'series': traverse_obj(course, ('content_description', 'title')), + 'series_id': str_or_none(traverse_obj(course, ('content_description', 'id'))), + 'id': vimeo_id, + 'chapter': module.get('title'), + 'chapter_id': str_or_none(module.get('id')), + 'title': activity.get('title'), + 'url': smuggle_url(f'https://player.vimeo.com/video/{vimeo_id}', {'http_headers': {'Referer': 'https://api.cybrary.it'}}) + } + + +class CybraryCourseIE(CybraryBaseIE): + _VALID_URL = r'https://app.cybrary.it/browse/course/(?P[\w-]+)/?(?:$|[#?])' + _TESTS = [{ + 'url': 'https://app.cybrary.it/browse/course/az-500-microsoft-azure-security-technologies', + 'info_dict': { + 'id': 898, + 'title': 'AZ-500: Microsoft Azure Security Technologies', + 'description': 'md5:69549d379c0fc1dec92926d4e8b6fbd4' + }, + 'playlist_count': 59 + }, { + 'url': 'https://app.cybrary.it/browse/course/cybrary-orientation', + 'info_dict': { + 'id': 1245, + 'title': 'Cybrary Orientation', + 'description': 'md5:9e69ff66b32fe78744e0ad4babe2e88e' + }, + 'playlist_count': 4 + }] + + def _real_extract(self, url): + course_id = self._match_id(url) + course = self._call_api('course', course_id) + enrollment_info = self._call_api('course_enrollment', course['id']) + + entries = [self.url_result( + f'https://app.cybrary.it/immersive/{enrollment_info["id"]}/activity/{activity["id"]}') + for activity in traverse_obj(course, ('content_item', 'learning_modules', ..., 'activities', ...))] + + return self.playlist_result( + entries, + traverse_obj(course, ('content_item', 'id'), expected_type=str_or_none), + course.get('title'), course.get('short_description')) diff --git a/hypervideo_dl/extractor/daftsex.py b/hypervideo_dl/extractor/daftsex.py new file mode 100644 index 0000000..6037fd9 --- /dev/null +++ b/hypervideo_dl/extractor/daftsex.py @@ -0,0 +1,146 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_b64decode +from ..utils import ( + int_or_none, + js_to_json, + parse_count, + parse_duration, + traverse_obj, + try_get, + unified_timestamp, +) + + +class DaftsexIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?daftsex\.com/watch/(?P-?\d+_\d+)' + _TESTS = [{ + 'url': 'https://daftsex.com/watch/-35370899_456246186', + 'md5': 'd95135e6cea2d905bea20dbe82cda64a', + 'info_dict': { + 'id': '-35370899_456246186', + 'ext': 'mp4', + 'title': 'just relaxing', + 'description': 'just relaxing - Watch video Watch video in high quality', + 'upload_date': '20201113', + 'timestamp': 1605261911, + 'thumbnail': r're:https://[^/]+/impf/-43BuMDIawmBGr3GLcZ93CYwWf2PBv_tVWoS1A/dnu41DnARU4\.jpg\?size=800x450&quality=96&keep_aspect_ratio=1&background=000000&sign=6af2c26ff4a45e55334189301c867384&type=video_thumb', + }, + }, { + 'url': 'https://daftsex.com/watch/-156601359_456242791', + 'info_dict': { + 'id': '-156601359_456242791', + 'ext': 'mp4', + 'title': 'Skye Blue - Dinner And A Show', + 'description': 'Skye Blue - Dinner And A Show - Watch video Watch video in high quality', + 'upload_date': '20200916', + 'timestamp': 1600250735, + 
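# [Editor's sketch] The daftsex extractor here de-obfuscates the CDN host: the player
# config ships it base64-encoded *and* reversed, which is what
# `compat_b64decode(video_params['server'][::-1])` below undoes. Standalone:

import base64

def decode_server_domain(blob):
    # reverse the string, then base64-decode; assumes the field is ASCII base64
    return 'https://' + base64.b64decode(blob[::-1]).decode('utf-8')

# decode_server_domain(base64.b64encode(b'cdn.example').decode()[::-1]) == 'https://cdn.example'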
'thumbnail': 'https://psv153-1.crazycloud.ru/videos/-156601359/456242791/thumb.jpg?extra=i3D32KaBbBFf9TqDRMAVmQ', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + title = self._html_search_meta('name', webpage, 'title') + timestamp = unified_timestamp(self._html_search_meta('uploadDate', webpage, 'Upload Date', default=None)) + description = self._html_search_meta('description', webpage, 'Description', default=None) + + duration = parse_duration(self._search_regex( + r'Duration: ((?:[0-9]{2}:){0,2}[0-9]{2})', + webpage, 'duration', fatal=False)) + views = parse_count(self._search_regex( + r'Views: ([0-9 ]+)', + webpage, 'views', fatal=False)) + + player_hash = self._search_regex( + r'DaxabPlayer\.Init\({[\s\S]*hash:\s*"([0-9a-zA-Z_\-]+)"[\s\S]*}', + webpage, 'player hash') + player_color = self._search_regex( + r'DaxabPlayer\.Init\({[\s\S]*color:\s*"([0-9a-z]+)"[\s\S]*}', + webpage, 'player color', fatal=False) or '' + + embed_page = self._download_webpage( + 'https://daxab.com/player/%s?color=%s' % (player_hash, player_color), + video_id, headers={'Referer': url}) + video_params = self._parse_json( + self._search_regex( + r'window\.globParams\s*=\s*({[\S\s]+})\s*;\s*<\/script>', + embed_page, 'video parameters'), + video_id, transform_source=js_to_json) + + server_domain = 'https://%s' % compat_b64decode(video_params['server'][::-1]).decode('utf-8') + + cdn_files = traverse_obj(video_params, ('video', 'cdn_files')) or {} + if cdn_files: + formats = [] + for format_id, format_data in cdn_files.items(): + ext, height = format_id.split('_') + formats.append({ + 'format_id': format_id, + 'url': f'{server_domain}/videos/{video_id.replace("_", "/")}/{height}.mp4?extra={format_data.split(".")[-1]}', + 'height': int_or_none(height), + 'ext': ext, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': description, + 'duration': duration, + 'thumbnail': try_get(video_params, lambda vi: 'https:' + compat_b64decode(vi['video']['thumb']).decode('utf-8')), + 'timestamp': timestamp, + 'view_count': views, + 'age_limit': 18, + } + + item = self._download_json( + f'{server_domain}/method/video.get/{video_id}', video_id, + headers={'Referer': url}, query={ + 'token': video_params['video']['access_token'], + 'videos': video_id, + 'ckey': video_params['c_key'], + 'credentials': video_params['video']['credentials'], + })['response']['items'][0] + + formats = [] + for f_id, f_url in item.get('files', {}).items(): + if f_id == 'external': + return self.url_result(f_url) + ext, height = f_id.split('_') + height_extra_key = traverse_obj(video_params, ('video', 'partial', 'quality', height)) + if height_extra_key: + formats.append({ + 'format_id': f'{height}p', + 'url': f'{server_domain}/{f_url[8:]}&videos={video_id}&extra_key={height_extra_key}', + 'height': int_or_none(height), + 'ext': ext, + }) + self._sort_formats(formats) + + thumbnails = [] + for k, v in item.items(): + if k.startswith('photo_') and v: + width = k.replace('photo_', '') + thumbnails.append({ + 'id': width, + 'url': v, + 'width': int_or_none(width), + }) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'comment_count': int_or_none(item.get('comments')), + 'description': description, + 'duration': duration, + 'thumbnails': thumbnails, + 'timestamp': timestamp, + 'view_count': views, + 'age_limit': 18, + } diff --git a/hypervideo_dl/extractor/dailymotion.py 
b/hypervideo_dl/extractor/dailymotion.py index e04e10b..9cb5618 100644 --- a/hypervideo_dl/extractor/dailymotion.py +++ b/hypervideo_dl/extractor/dailymotion.py @@ -94,10 +94,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor): _VALID_URL = r'''(?ix) https?:// (?: - (?:(?:www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|\#)/)?video|swf)| + (?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:(?:embed|swf|\#)/)|player\.html\?)?video|swf)| (?:www\.)?lequipe\.fr/video ) - /(?P[^/?_]+)(?:.+?\bplaylist=(?Px[0-9a-z]+))? + [/=](?P[^/?_&]+)(?:.+?\bplaylist=(?Px[0-9a-z]+))? ''' IE_NAME = 'dailymotion' _TESTS = [{ @@ -115,6 +115,25 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'uploader_id': 'x1xm8ri', 'age_limit': 0, }, + }, { + 'url': 'https://geo.dailymotion.com/player.html?video=x89eyek&mute=true', + 'md5': 'e2f9717c6604773f963f069ca53a07f8', + 'info_dict': { + 'id': 'x89eyek', + 'ext': 'mp4', + 'title': "En quête d'esprit du 27/03/2022", + 'description': 'md5:66542b9f4df2eb23f314fc097488e553', + 'duration': 2756, + 'timestamp': 1648383669, + 'upload_date': '20220327', + 'uploader': 'CNEWS', + 'uploader_id': 'x24vth', + 'age_limit': 0, + 'view_count': int, + 'like_count': int, + 'tags': ['en_quete_d_esprit'], + 'thumbnail': 'https://s2.dmcdn.net/v/Tncwi1YGKdvFbDuDY/x1080', + } }, { 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', 'md5': '2137c41a8e78554bb09225b8eb322406', @@ -207,12 +226,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor): video_id, playlist_id = self._match_valid_url(url).groups() if playlist_id: - if not self.get_param('noplaylist'): - self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % playlist_id) + if self._yes_playlist(playlist_id, video_id): return self.url_result( 'http://www.dailymotion.com/playlist/' + playlist_id, 'DailymotionPlaylist', playlist_id) - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) password = self.get_param('videopassword') media = self._call_api( @@ -261,9 +278,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): continue if media_type == 'application/x-mpegURL': formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', - 'm3u8' if is_live else 'm3u8_native', - m3u8_id='hls', fatal=False)) + media_url, video_id, 'mp4', live=is_live, m3u8_id='hls', fatal=False)) else: f = { 'url': media_url, @@ -305,7 +320,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'description': clean_html(media.get('description')), 'thumbnails': thumbnails, 'duration': int_or_none(metadata.get('duration')) or None, diff --git a/hypervideo_dl/extractor/daum.py b/hypervideo_dl/extractor/daum.py index 8aa2af9..4362e92 100644 --- a/hypervideo_dl/extractor/daum.py +++ b/hypervideo_dl/extractor/daum.py @@ -157,11 +157,8 @@ class DaumListIE(InfoExtractor): query_dict = parse_qs(url) if 'clipid' in query_dict: clip_id = query_dict['clipid'][0] - if self.get_param('noplaylist'): - self.to_screen('Downloading just video %s because of --no-playlist' % clip_id) + if not self._yes_playlist(list_id, clip_id): return self.url_result(DaumClipIE._URL_TEMPLATE % clip_id, 'DaumClip') - else: - self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % list_id) class DaumPlaylistIE(DaumListIE): diff --git a/hypervideo_dl/extractor/daystar.py b/hypervideo_dl/extractor/daystar.py new 
file mode 100644 index 0000000..4f59d90 --- /dev/null +++ b/hypervideo_dl/extractor/daystar.py @@ -0,0 +1,48 @@ +from .common import InfoExtractor +from ..utils import js_to_json, urljoin + + +class DaystarClipIE(InfoExtractor): + IE_NAME = 'daystar:clip' + _VALID_URL = r'https?://player\.daystar\.tv/(?P\w+)' + _TESTS = [{ + 'url': 'https://player.daystar.tv/0MTO2ITM', + 'info_dict': { + 'id': '0MTO2ITM', + 'ext': 'mp4', + 'title': 'The Dark World of COVID Pt. 1 | Aaron Siri', + 'description': 'a420d320dda734e5f29458df3606c5f4', + 'thumbnail': r're:^https?://.+\.jpg', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + src_iframe = self._search_regex(r'\]+src="([^"]+)"', webpage, 'src iframe') + webpage_iframe = self._download_webpage( + src_iframe.replace('player.php', 'config2.php'), video_id, headers={'Referer': src_iframe}) + + sources = self._parse_json(self._search_regex( + r'sources\:\s*(\[.*?\])', webpage_iframe, 'm3u8 source'), video_id, transform_source=js_to_json) + + formats, subtitles = [], {} + for source in sources: + file = source.get('file') + if file and source.get('type') == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + urljoin('https://www.lightcast.com/embed/', file), + video_id, 'mp4', fatal=False, headers={'Referer': src_iframe}) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage), + 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage), + 'thumbnail': self._search_regex(r'image:\s*"([^"]+)', webpage_iframe, 'thumbnail'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/hypervideo_dl/extractor/digitalconcerthall.py b/hypervideo_dl/extractor/digitalconcerthall.py new file mode 100644 index 0000000..8398ae3 --- /dev/null +++ b/hypervideo_dl/extractor/digitalconcerthall.py @@ -0,0 +1,141 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..utils import ( + ExtractorError, + parse_resolution, + traverse_obj, + try_get, + urlencode_postdata, +) + + +class DigitalConcertHallIE(InfoExtractor): + IE_DESC = 'DigitalConcertHall extractor' + _VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P[a-z]+)/concert/(?P[0-9]+)' + _OAUTH_URL = 'https://api.digitalconcerthall.com/v2/oauth2/token' + _ACCESS_TOKEN = None + _NETRC_MACHINE = 'digitalconcerthall' + _TESTS = [{ + 'note': 'Playlist with only one video', + 'url': 'https://www.digitalconcerthall.com/en/concert/53201', + 'info_dict': { + 'id': '53201-1', + 'ext': 'mp4', + 'composer': 'Kurt Weill', + 'title': '[Magic Night]', + 'thumbnail': r're:^https?://images.digitalconcerthall.com/cms/thumbnails.*\.jpg$', + 'upload_date': '20210624', + 'timestamp': 1624548600, + 'duration': 2798, + 'album_artist': 'Members of the Berliner Philharmoniker / Simon Rössler', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'Concert with several works and an interview', + 'url': 'https://www.digitalconcerthall.com/en/concert/53785', + 'info_dict': { + 'id': '53785', + 'album_artist': 'Berliner Philharmoniker / Kirill Petrenko', + 'title': 'Kirill Petrenko conducts Mendelssohn and Shostakovich', + }, + 'params': {'skip_download': 'm3u8'}, + 'playlist_count': 3, + }] + + def _perform_login(self, username, password): + token_response = self._download_json( + 
self._OAUTH_URL, + None, 'Obtaining token', errnote='Unable to obtain token', data=urlencode_postdata({ + 'affiliate': 'none', + 'grant_type': 'device', + 'device_vendor': 'unknown', + 'app_id': 'dch.webapp', + 'app_version': '1.0.0', + 'client_secret': '2ySLN+2Fwb', + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + self._ACCESS_TOKEN = token_response['access_token'] + try: + self._download_json( + self._OAUTH_URL, + None, note='Logging in', errnote='Unable to login', data=urlencode_postdata({ + 'grant_type': 'password', + 'username': username, + 'password': password, + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': 'https://www.digitalconcerthall.com', + 'Authorization': f'Bearer {self._ACCESS_TOKEN}' + }) + except ExtractorError: + self.raise_login_required(msg='Login info incorrect') + + def _real_initialize(self): + if not self._ACCESS_TOKEN: + self.raise_login_required(method='password') + + def _entries(self, items, language, **kwargs): + for item in items: + video_id = item['id'] + stream_info = self._download_json( + self._proto_relative_url(item['_links']['streams']['href']), video_id, headers={ + 'Accept': 'application/json', + 'Authorization': f'Bearer {self._ACCESS_TOKEN}', + 'Accept-Language': language + }) + + m3u8_url = traverse_obj( + stream_info, ('channel', lambda x: x.startswith('vod_mixed'), 'stream', 0, 'url'), get_all=False) + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', 'm3u8_native', fatal=False) + self._sort_formats(formats) + + yield { + 'id': video_id, + 'title': item.get('title'), + 'composer': item.get('name_composer'), + 'url': m3u8_url, + 'formats': formats, + 'duration': item.get('duration_total'), + 'timestamp': traverse_obj(item, ('date', 'published')), + 'description': item.get('short_description') or stream_info.get('short_description'), + **kwargs, + 'chapters': [{ + 'start_time': chapter.get('time'), + 'end_time': try_get(chapter, lambda x: x['time'] + x['duration']), + 'title': chapter.get('text'), + } for chapter in item['cuepoints']] if item.get('cuepoints') else None, + } + + def _real_extract(self, url): + language, video_id = self._match_valid_url(url).group('language', 'id') + if not language: + language = 'en' + + thumbnail_url = self._html_search_regex( + r'(https?://images\.digitalconcerthall\.com/cms/thumbnails/.*\.jpg)', + self._download_webpage(url, video_id), 'thumbnail') + thumbnails = [{ + 'url': thumbnail_url, + **parse_resolution(thumbnail_url) + }] + + vid_info = self._download_json( + f'https://api.digitalconcerthall.com/v2/concert/{video_id}', video_id, headers={ + 'Accept': 'application/json', + 'Accept-Language': language + }) + album_artist = ' / '.join(traverse_obj(vid_info, ('_links', 'artist', ..., 'name')) or '') + + return { + '_type': 'playlist', + 'id': video_id, + 'title': vid_info.get('title'), + 'entries': self._entries(traverse_obj(vid_info, ('_embedded', ..., ...)), language, + thumbnails=thumbnails, album_artist=album_artist), + 'thumbnails': thumbnails, + 'album_artist': album_artist, + } diff --git a/hypervideo_dl/extractor/disney.py b/hypervideo_dl/extractor/disney.py index f018cbe..0ad7b1f 100644 --- a/hypervideo_dl/extractor/disney.py +++ b/hypervideo_dl/extractor/disney.py @@ -7,8 +7,8 @@ from .common import InfoExtractor from ..utils import ( int_or_none, unified_strdate, - compat_str, determine_ext, + join_nonempty, update_url_query, ) @@ -119,18 +119,13 @@ class DisneyIE(InfoExtractor): continue formats.append(f) continue - 
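# [Editor's note] The disney change just below replaces hand-rolled format_id assembly
# with yt-dlp's join_nonempty(), which joins its arguments with '-' and silently drops
# empty/None values. A minimal equivalent for reference:

def join_nonempty(*values, delim='-'):
    # join_nonempty('applehttp', 800) -> 'applehttp-800'; join_nonempty(None, 800) -> '800'
    return delim.join(str(v) for v in values if v)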
format_id = [] - if flavor_format: - format_id.append(flavor_format) - if tbr: - format_id.append(compat_str(tbr)) ext = determine_ext(flavor_url) if flavor_format == 'applehttp' or ext == 'm3u8': ext = 'mp4' width = int_or_none(flavor.get('width')) height = int_or_none(flavor.get('height')) formats.append({ - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty(flavor_format, tbr), 'url': flavor_url, 'width': width, 'height': height, diff --git a/hypervideo_dl/extractor/dispeak.py b/hypervideo_dl/extractor/dispeak.py index be7ad12..3d651f3 100644 --- a/hypervideo_dl/extractor/dispeak.py +++ b/hypervideo_dl/extractor/dispeak.py @@ -74,13 +74,11 @@ class DigitallySpeakingIE(InfoExtractor): tbr = int_or_none(bitrate) vbr = int_or_none(self._search_regex( r'-(\d+)\.mp4', video_path, 'vbr', default=None)) - abr = tbr - vbr if tbr and vbr else None video_formats.append({ 'format_id': bitrate, 'url': url, 'tbr': tbr, 'vbr': vbr, - 'abr': abr, }) return video_formats @@ -121,6 +119,7 @@ class DigitallySpeakingIE(InfoExtractor): video_formats = self._parse_mp4(metadata) if video_formats is None: video_formats = self._parse_flv(metadata) + self._sort_formats(video_formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/dlive.py b/hypervideo_dl/extractor/dlive.py index 90462c0..7410eb6 100644 --- a/hypervideo_dl/extractor/dlive.py +++ b/hypervideo_dl/extractor/dlive.py @@ -84,7 +84,7 @@ class DLiveStreamIE(InfoExtractor): self._sort_formats(formats) return { 'id': display_name, - 'title': self._live_title(title), + 'title': title, 'uploader': display_name, 'uploader_id': username, 'formats': formats, diff --git a/hypervideo_dl/extractor/doodstream.py b/hypervideo_dl/extractor/doodstream.py index 2c9ea68..f692127 100644 --- a/hypervideo_dl/extractor/doodstream.py +++ b/hypervideo_dl/extractor/doodstream.py @@ -20,6 +20,16 @@ class DoodStreamIE(InfoExtractor): 'description': 'Kat Wonders - Monthly May 2020 | DoodStream.com', 'thumbnail': 'https://img.doodcdn.com/snaps/flyus84qgl2fsk4g.jpg', } + }, { + 'url': 'http://dood.watch/d/5s1wmbdacezb', + 'md5': '4568b83b31e13242b3f1ff96c55f0595', + 'info_dict': { + 'id': '5s1wmbdacezb', + 'ext': 'mp4', + 'title': 'Kat Wonders - Monthly May 2020', + 'description': 'Kat Wonders - Monthly May 2020 | DoodStream.com', + 'thumbnail': 'https://img.doodcdn.com/snaps/flyus84qgl2fsk4g.jpg', + } }, { 'url': 'https://dood.to/d/jzrxn12t2s7n', 'md5': '3207e199426eca7c2aa23c2872e6728a', @@ -34,31 +44,26 @@ class DoodStreamIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + url = f'https://dood.to/e/{video_id}' webpage = self._download_webpage(url, video_id) - if '/d/' in url: - url = "https://dood.to" + self._html_search_regex( - r'