From 1e5a50b71d8f0eae6007bedc329eecb24bb5aba3 Mon Sep 17 00:00:00 2001
From: Jesús
Date: Wed, 6 Apr 2022 03:37:17 +0800
Subject: update from upstream

---
 hypervideo_dl/extractor/__init__.py | 21 +-
 hypervideo_dl/extractor/abc.py | 67 +-
 hypervideo_dl/extractor/abematv.py | 476 +++
 hypervideo_dl/extractor/adn.py | 30 +-
 hypervideo_dl/extractor/adobeconnect.py | 4 +-
 hypervideo_dl/extractor/adobepass.py | 61 +-
 hypervideo_dl/extractor/adobetv.py | 3 +-
 hypervideo_dl/extractor/afreecatv.py | 120 +-
 hypervideo_dl/extractor/aliexpress.py | 2 +-
 hypervideo_dl/extractor/aljazeera.py | 87 +-
 hypervideo_dl/extractor/allocine.py | 6 +-
 hypervideo_dl/extractor/alsace20tv.py | 87 +
 hypervideo_dl/extractor/alura.py | 9 +-
 hypervideo_dl/extractor/amazon.py | 53 +
 hypervideo_dl/extractor/animelab.py | 35 +-
 hypervideo_dl/extractor/animeondemand.py | 31 +-
 hypervideo_dl/extractor/ant1newsgr.py | 143 +
 hypervideo_dl/extractor/anvato.py | 7 +-
 hypervideo_dl/extractor/aparat.py | 15 +-
 hypervideo_dl/extractor/applepodcasts.py | 48 +-
 hypervideo_dl/extractor/archiveorg.py | 512 +++-
 hypervideo_dl/extractor/arcpublishing.py | 5 +-
 hypervideo_dl/extractor/ard.py | 76 +-
 hypervideo_dl/extractor/arnes.py | 3 +-
 hypervideo_dl/extractor/arte.py | 50 +-
 hypervideo_dl/extractor/asiancrush.py | 3 +-
 hypervideo_dl/extractor/atresplayer.py | 12 +-
 hypervideo_dl/extractor/atvat.py | 6 +
 hypervideo_dl/extractor/audiomack.py | 35 +-
 hypervideo_dl/extractor/awaan.py | 5 +-
 hypervideo_dl/extractor/azmedien.py | 10 +-
 hypervideo_dl/extractor/banbye.py | 153 +
 hypervideo_dl/extractor/bandaichannel.py | 1 -
 hypervideo_dl/extractor/bandcamp.py | 69 +-
 hypervideo_dl/extractor/bbc.py | 89 +-
 hypervideo_dl/extractor/beeg.py | 123 +-
 hypervideo_dl/extractor/bigo.py | 59 +
 hypervideo_dl/extractor/bilibili.py | 405 ++-
 hypervideo_dl/extractor/biqle.py | 93 +-
 hypervideo_dl/extractor/bitwave.py | 2 +-
 hypervideo_dl/extractor/blogger.py | 54 +
 hypervideo_dl/extractor/bongacams.py | 2 +-
 hypervideo_dl/extractor/br.py | 5 +-
 hypervideo_dl/extractor/breitbart.py | 38 +
 hypervideo_dl/extractor/brightcove.py | 40 +-
 hypervideo_dl/extractor/cableav.py | 34 +
 hypervideo_dl/extractor/callin.py | 114 +
 hypervideo_dl/extractor/caltrans.py | 41 +
 hypervideo_dl/extractor/cam4.py | 5 +-
 hypervideo_dl/extractor/cammodels.py | 2 +-
 hypervideo_dl/extractor/canalalpha.py | 98 +
 hypervideo_dl/extractor/canvas.py | 68 +-
 hypervideo_dl/extractor/carambatv.py | 3 +-
 hypervideo_dl/extractor/cbc.py | 182 +-
 hypervideo_dl/extractor/cbs.py | 28 +-
 hypervideo_dl/extractor/ccma.py | 13 +-
 hypervideo_dl/extractor/cctv.py | 3 +-
 hypervideo_dl/extractor/ceskatelevize.py | 130 +-
 hypervideo_dl/extractor/chaturbate.py | 2 +-
 hypervideo_dl/extractor/chingari.py | 4 +-
 hypervideo_dl/extractor/closertotruth.py | 3 +-
 hypervideo_dl/extractor/common.py | 468 ++-
 hypervideo_dl/extractor/corus.py | 1 -
 hypervideo_dl/extractor/coub.py | 3 +-
 hypervideo_dl/extractor/cozytv.py | 40 +
 hypervideo_dl/extractor/cpac.py | 148 +
 hypervideo_dl/extractor/crackle.py | 40 +-
 hypervideo_dl/extractor/craftsy.py | 71 +
 hypervideo_dl/extractor/crowdbunker.py | 113 +
 hypervideo_dl/extractor/crunchyroll.py | 359 ++-
 hypervideo_dl/extractor/cspan.py | 52 +-
 hypervideo_dl/extractor/ctvnews.py | 5 +
 hypervideo_dl/extractor/curiositystream.py | 84 +-
 hypervideo_dl/extractor/cybrary.py | 146 +
 hypervideo_dl/extractor/daftsex.py | 146 +
 hypervideo_dl/extractor/dailymotion.py | 33 +-
 hypervideo_dl/extractor/daum.py | 5 +-
 hypervideo_dl/extractor/daystar.py | 48 +
 hypervideo_dl/extractor/digitalconcerthall.py | 141 +
 hypervideo_dl/extractor/disney.py | 9 +-
 hypervideo_dl/extractor/dispeak.py | 3 +-
 hypervideo_dl/extractor/dlive.py | 2 +-
 hypervideo_dl/extractor/doodstream.py | 37 +-
 hypervideo_dl/extractor/douyutv.py | 2 +-
 hypervideo_dl/extractor/dplay.py | 857 ++++--
 hypervideo_dl/extractor/drooble.py | 116 +
 hypervideo_dl/extractor/dropbox.py | 44 +-
 hypervideo_dl/extractor/dropout.py | 212 ++
 hypervideo_dl/extractor/drtv.py | 18 +-
 hypervideo_dl/extractor/dvtv.py | 7 +-
 hypervideo_dl/extractor/egghead.py | 1 -
 hypervideo_dl/extractor/ellentube.py | 3 +-
 hypervideo_dl/extractor/elonet.py | 85 +-
 hypervideo_dl/extractor/engadget.py | 10 -
 hypervideo_dl/extractor/epicon.py | 4 +-
 hypervideo_dl/extractor/eroprofile.py | 9 +-
 hypervideo_dl/extractor/ertgr.py | 316 ++
 hypervideo_dl/extractor/espn.py | 43 +
 hypervideo_dl/extractor/europeantour.py | 37 +
 hypervideo_dl/extractor/euscreen.py | 2 +-
 hypervideo_dl/extractor/extractors.py | 366 ++-
 hypervideo_dl/extractor/facebook.py | 158 +-
 hypervideo_dl/extractor/fancode.py | 41 +-
 hypervideo_dl/extractor/fc2.py | 201 +-
 hypervideo_dl/extractor/filmon.py | 2 +-
 hypervideo_dl/extractor/fivetv.py | 3 +-
 hypervideo_dl/extractor/flickr.py | 3 +-
 hypervideo_dl/extractor/fox.py | 39 +-
 hypervideo_dl/extractor/foxgay.py | 3 +-
 hypervideo_dl/extractor/fptplay.py | 102 +
 hypervideo_dl/extractor/franceculture.py | 101 +-
 hypervideo_dl/extractor/francetv.py | 6 +-
 hypervideo_dl/extractor/frontendmasters.py | 13 +-
 hypervideo_dl/extractor/fujitv.py | 70 +-
 hypervideo_dl/extractor/funimation.py | 25 +-
 hypervideo_dl/extractor/funk.py | 2 +-
 hypervideo_dl/extractor/gab.py | 89 +-
 hypervideo_dl/extractor/gaia.py | 30 +-
 hypervideo_dl/extractor/gamejolt.py | 541 ++++
 hypervideo_dl/extractor/generic.py | 461 ++-
 hypervideo_dl/extractor/gettr.py | 159 +-
 hypervideo_dl/extractor/gfycat.py | 43 +-
 hypervideo_dl/extractor/glide.py | 4 +-
 hypervideo_dl/extractor/globo.py | 43 +-
 hypervideo_dl/extractor/glomex.py | 220 ++
 hypervideo_dl/extractor/go.py | 8 +-
 hypervideo_dl/extractor/gofile.py | 83 +
 hypervideo_dl/extractor/googlesearch.py | 21 +-
 hypervideo_dl/extractor/gronkh.py | 5 +-
 hypervideo_dl/extractor/hellporno.py | 3 +-
 hypervideo_dl/extractor/hidive.py | 8 +-
 hypervideo_dl/extractor/hitbox.py | 2 +-
 hypervideo_dl/extractor/hotstar.py | 10 +-
 hypervideo_dl/extractor/hrfensehen.py | 10 +-
 hypervideo_dl/extractor/hrti.py | 15 +-
 hypervideo_dl/extractor/hse.py | 95 +
 hypervideo_dl/extractor/huffpost.py | 3 -
 hypervideo_dl/extractor/huya.py | 137 +
 hypervideo_dl/extractor/imdb.py | 64 +-
 hypervideo_dl/extractor/imggaming.py | 22 +-
 hypervideo_dl/extractor/infoq.py | 2 +-
 hypervideo_dl/extractor/instagram.py | 552 ++--
 hypervideo_dl/extractor/internazionale.py | 6 -
 hypervideo_dl/extractor/iprima.py | 145 +-
 hypervideo_dl/extractor/iqiyi.py | 377 ++-
 hypervideo_dl/extractor/itprotv.py | 141 +
 hypervideo_dl/extractor/itv.py | 44 +-
 hypervideo_dl/extractor/ivideon.py | 2 +-
 hypervideo_dl/extractor/iwara.py | 3 +-
 hypervideo_dl/extractor/jamendo.py | 2 +-
 hypervideo_dl/extractor/joj.py | 3 +-
 hypervideo_dl/extractor/kakao.py | 46 +-
 hypervideo_dl/extractor/kaltura.py | 11 +-
 hypervideo_dl/extractor/keezmovies.py | 3 +-
 hypervideo_dl/extractor/kelbyone.py | 84 +
 hypervideo_dl/extractor/kinopoisk.py | 3 -
 hypervideo_dl/extractor/koo.py | 2 +-
 hypervideo_dl/extractor/la7.py | 54 +-
 hypervideo_dl/extractor/laola1tv.py | 4 +-
 hypervideo_dl/extractor/lastfm.py | 129 +
 hypervideo_dl/extractor/lbry.py | 43 +-
 hypervideo_dl/extractor/lecturio.py | 9 +-
 hypervideo_dl/extractor/lego.py | 7 +-
 hypervideo_dl/extractor/limelight.py | 2 +-
 hypervideo_dl/extractor/line.py | 112 +-
 hypervideo_dl/extractor/linkedin.py | 100 +-
 hypervideo_dl/extractor/linuxacademy.py | 9 +-
 hypervideo_dl/extractor/litv.py | 23 +-
 hypervideo_dl/extractor/livestream.py | 4 +-
 hypervideo_dl/extractor/lnkgo.py | 88 +-
 hypervideo_dl/extractor/lynda.py | 11 +-
 hypervideo_dl/extractor/mainstreaming.py | 219 ++
 hypervideo_dl/extractor/mangomolo.py | 2 +-
 hypervideo_dl/extractor/manyvids.py | 1 +
 hypervideo_dl/extractor/matchtv.py | 2 +-
 hypervideo_dl/extractor/mdr.py | 12 +-
 hypervideo_dl/extractor/medaltv.py | 3 +-
 hypervideo_dl/extractor/mediaklikk.py | 4 +-
 hypervideo_dl/extractor/mediaset.py | 165 +-
 hypervideo_dl/extractor/mediasite.py | 11 +-
 hypervideo_dl/extractor/megatvcom.py | 173 ++
 hypervideo_dl/extractor/mgtv.py | 59 +-
 hypervideo_dl/extractor/miaopai.py | 3 +-
 hypervideo_dl/extractor/microsoftstream.py | 125 +
 hypervideo_dl/extractor/mildom.py | 336 ++-
 hypervideo_dl/extractor/minds.py | 3 +-
 hypervideo_dl/extractor/mirrativ.py | 83 +-
 hypervideo_dl/extractor/mixch.py | 85 +
 hypervideo_dl/extractor/mixcloud.py | 16 +-
 hypervideo_dl/extractor/mlssoccer.py | 117 +
 hypervideo_dl/extractor/mojvideo.py | 3 +-
 hypervideo_dl/extractor/mtv.py | 17 +-
 hypervideo_dl/extractor/muenchentv.py | 2 +-
 hypervideo_dl/extractor/murrtube.py | 165 ++
 hypervideo_dl/extractor/musescore.py | 8 +-
 hypervideo_dl/extractor/musicdex.py | 175 ++
 hypervideo_dl/extractor/mxplayer.py | 2 +-
 hypervideo_dl/extractor/myspass.py | 63 +-
 hypervideo_dl/extractor/n1.py | 22 +-
 hypervideo_dl/extractor/nate.py | 124 +
 hypervideo_dl/extractor/naver.py | 7 +-
 hypervideo_dl/extractor/nba.py | 12 +-
 hypervideo_dl/extractor/nbc.py | 27 +-
 hypervideo_dl/extractor/ndr.py | 2 -
 hypervideo_dl/extractor/nebula.py | 368 +--
 hypervideo_dl/extractor/neteasemusic.py | 13 +-
 hypervideo_dl/extractor/newgrounds.py | 25 +-
 hypervideo_dl/extractor/newstube.py | 10 +-
 hypervideo_dl/extractor/newsy.py | 51 +
 hypervideo_dl/extractor/nexx.py | 147 +-
 hypervideo_dl/extractor/nfb.py | 62 +
 hypervideo_dl/extractor/nfl.py | 2 +-
 hypervideo_dl/extractor/nhk.py | 152 +-
 hypervideo_dl/extractor/niconico.py | 823 +++---
 hypervideo_dl/extractor/ninecninemedia.py | 35 +-
 hypervideo_dl/extractor/nitter.py | 221 +-
 hypervideo_dl/extractor/njpwworld.py | 19 +-
 hypervideo_dl/extractor/noco.py | 9 +-
 hypervideo_dl/extractor/noodlemagazine.py | 67 +
 hypervideo_dl/extractor/nova.py | 34 +-
 hypervideo_dl/extractor/novaplay.py | 4 +-
 hypervideo_dl/extractor/npo.py | 4 +-
 hypervideo_dl/extractor/npr.py | 3 +-
 hypervideo_dl/extractor/nrk.py | 13 +-
 hypervideo_dl/extractor/nrl.py | 1 -
 hypervideo_dl/extractor/ntvcojp.py | 27 +-
 hypervideo_dl/extractor/nuvid.py | 49 +-
 hypervideo_dl/extractor/odnoklassniki.py | 97 +-
 hypervideo_dl/extractor/oktoberfesttv.py | 4 +-
 hypervideo_dl/extractor/olympics.py | 71 +-
 hypervideo_dl/extractor/ondemandkorea.py | 6 +-
 hypervideo_dl/extractor/onefootball.py | 51 +
 hypervideo_dl/extractor/onet.py | 7 +-
 hypervideo_dl/extractor/opencast.py | 177 ++
 hypervideo_dl/extractor/openload.py | 14 +-
 hypervideo_dl/extractor/openrec.py | 161 +-
 hypervideo_dl/extractor/orf.py | 231 +-
 hypervideo_dl/extractor/packtpub.py | 5 +-
 hypervideo_dl/extractor/panopto.py | 607 ++++
 hypervideo_dl/extractor/paramountplus.py | 31 +-
 hypervideo_dl/extractor/parliamentliveuk.py | 3 -
 hypervideo_dl/extractor/patreon.py | 12 +-
 hypervideo_dl/extractor/pbs.py | 7 +-
 hypervideo_dl/extractor/peekvids.py | 81 +
 hypervideo_dl/extractor/peertube.py | 5 +-
 hypervideo_dl/extractor/peertv.py | 57 +
 hypervideo_dl/extractor/peloton.py | 1 -
 hypervideo_dl/extractor/periscope.py | 2 +-
 hypervideo_dl/extractor/piapro.py | 96 +
 hypervideo_dl/extractor/picarto.py | 4 +-
 hypervideo_dl/extractor/piksel.py | 10 +-
 hypervideo_dl/extractor/pixivsketch.py | 122 +
 hypervideo_dl/extractor/pladform.py | 26 +-
 hypervideo_dl/extractor/planetmarathi.py | 76 +
 hypervideo_dl/extractor/platzi.py | 9 +-
 hypervideo_dl/extractor/playplustv.py | 12 +-
 hypervideo_dl/extractor/playtvak.py | 2 -
 hypervideo_dl/extractor/playvid.py | 3 +-
 hypervideo_dl/extractor/pluralsight.py | 9 +-
 hypervideo_dl/extractor/plutotv.py | 7 +-
 hypervideo_dl/extractor/pokemon.py | 40 +
 hypervideo_dl/extractor/pokergo.py | 109 +
 hypervideo_dl/extractor/polsatgo.py | 90 +
 hypervideo_dl/extractor/polskieradio.py | 303 +-
 hypervideo_dl/extractor/pornez.py | 43 +
 hypervideo_dl/extractor/pornflip.py | 1 -
 hypervideo_dl/extractor/pornhub.py | 16 +-
 hypervideo_dl/extractor/projectveritas.py | 2 +-
 hypervideo_dl/extractor/prx.py | 431 +++
 hypervideo_dl/extractor/radiode.py | 2 +-
 hypervideo_dl/extractor/radiokapital.py | 99 +
 hypervideo_dl/extractor/radiozet.py | 51 +
 hypervideo_dl/extractor/radlive.py | 10 +-
 hypervideo_dl/extractor/rai.py | 198 +-
 hypervideo_dl/extractor/rcti.py | 128 +-
 hypervideo_dl/extractor/redbulltv.py | 3 +-
 hypervideo_dl/extractor/reddit.py | 86 +-
 hypervideo_dl/extractor/redgifs.py | 232 ++
 hypervideo_dl/extractor/redtube.py | 35 +-
 hypervideo_dl/extractor/rmcdecouverte.py | 1 -
 hypervideo_dl/extractor/rokfin.py | 256 ++
 hypervideo_dl/extractor/roosterteeth.py | 208 +-
 hypervideo_dl/extractor/rtbf.py | 2 -
 hypervideo_dl/extractor/rtl2.py | 16 +-
 hypervideo_dl/extractor/rtnews.py | 199 ++
 hypervideo_dl/extractor/rtrfm.py | 67 +
 hypervideo_dl/extractor/rtve.py | 95 +-
 hypervideo_dl/extractor/rtvs.py | 74 +-
 hypervideo_dl/extractor/rule34video.py | 65 +
 hypervideo_dl/extractor/rumble.py | 17 +-
 hypervideo_dl/extractor/rutube.py | 21 +-
 hypervideo_dl/extractor/rutv.py | 13 +-
 hypervideo_dl/extractor/ruutu.py | 15 +
 hypervideo_dl/extractor/ruv.py | 88 +
 hypervideo_dl/extractor/safari.py | 9 +-
 hypervideo_dl/extractor/sbs.py | 17 +-
 hypervideo_dl/extractor/scte.py | 9 +-
 hypervideo_dl/extractor/senategov.py | 213 ++
 hypervideo_dl/extractor/sendtonews.py | 2 +-
 hypervideo_dl/extractor/sevenplus.py | 1 -
 hypervideo_dl/extractor/shahid.py | 8 +-
 hypervideo_dl/extractor/shemaroome.py | 11 +-
 hypervideo_dl/extractor/showroomlive.py | 2 +-
 hypervideo_dl/extractor/skeb.py | 143 +
 hypervideo_dl/extractor/sky.py | 28 +
 hypervideo_dl/extractor/skyit.py | 7 +-
 hypervideo_dl/extractor/skylinewebcams.py | 2 +-
 hypervideo_dl/extractor/skynewsau.py | 2 +-
 hypervideo_dl/extractor/slideslive.py | 3 -
 hypervideo_dl/extractor/sonyliv.py | 60 +-
 hypervideo_dl/extractor/soundcloud.py | 344 ++-
 hypervideo_dl/extractor/southpark.py | 17 +-
 hypervideo_dl/extractor/sovietscloset.py | 15 +-
 hypervideo_dl/extractor/spiegel.py | 2 +-
 hypervideo_dl/extractor/sportdeutschland.py | 8 +-
 hypervideo_dl/extractor/srgssr.py | 7 +-
 hypervideo_dl/extractor/steam.py | 140 +-
 hypervideo_dl/extractor/storyfire.py | 17 +-
 hypervideo_dl/extractor/streamcz.py | 173 +-
 hypervideo_dl/extractor/streamff.py | 31 +
 hypervideo_dl/extractor/stripchat.py | 66 +
 hypervideo_dl/extractor/stv.py | 5 +-
 hypervideo_dl/extractor/sunporno.py | 3 +-
 hypervideo_dl/extractor/svt.py | 32 +-
 hypervideo_dl/extractor/tagesschau.py | 279 +-
 hypervideo_dl/extractor/teachable.py | 3 +-
 hypervideo_dl/extractor/teamtreehouse.py | 7 +-
 hypervideo_dl/extractor/ted.py | 477 ++--
 hypervideo_dl/extractor/tele5.py | 87 +-
 hypervideo_dl/extractor/telebruxelles.py | 2 +-
 hypervideo_dl/extractor/telegram.py | 37 +
 hypervideo_dl/extractor/telemundo.py | 5 +-
 hypervideo_dl/extractor/telequebec.py | 12 -
 hypervideo_dl/extractor/tennistv.py | 9 +-
 hypervideo_dl/extractor/tenplay.py | 44 +-
 hypervideo_dl/extractor/tf1.py | 1 -
 hypervideo_dl/extractor/theta.py | 10 +-
 hypervideo_dl/extractor/thisav.py | 4 +-
 hypervideo_dl/extractor/thisoldhouse.py | 17 +-
 hypervideo_dl/extractor/threeqsdn.py | 18 +-
 hypervideo_dl/extractor/threespeak.py | 97 +
 hypervideo_dl/extractor/tiktok.py | 449 ++-
 hypervideo_dl/extractor/toggo.py | 73 +
 hypervideo_dl/extractor/tokentube.py | 12 +-
 hypervideo_dl/extractor/tonline.py | 9 +-
 hypervideo_dl/extractor/toutv.py | 7 +-
 hypervideo_dl/extractor/traileraddict.py | 3 +-
 hypervideo_dl/extractor/trovo.py | 43 +-
 hypervideo_dl/extractor/trueid.py | 139 +
 hypervideo_dl/extractor/tubitv.py | 20 +-
 hypervideo_dl/extractor/tumblr.py | 408 ++-
 hypervideo_dl/extractor/tunein.py | 2 +-
 hypervideo_dl/extractor/turner.py | 2 +-
 hypervideo_dl/extractor/tv2.py | 17 +-
 hypervideo_dl/extractor/tv2dk.py | 17 +-
 hypervideo_dl/extractor/tver.py | 37 +-
 hypervideo_dl/extractor/tvnet.py | 7 +-
 hypervideo_dl/extractor/tvopengr.py | 128 +
 hypervideo_dl/extractor/tvp.py | 461 ++-
 hypervideo_dl/extractor/tvplay.py | 114 +-
 hypervideo_dl/extractor/tvplayer.py | 2 +-
 hypervideo_dl/extractor/twitcasting.py | 166 +-
 hypervideo_dl/extractor/twitch.py | 96 +-
 hypervideo_dl/extractor/twitter.py | 13 +-
 hypervideo_dl/extractor/udemy.py | 9 +-
 hypervideo_dl/extractor/uol.py | 1 -
 hypervideo_dl/extractor/urplay.py | 53 +-
 hypervideo_dl/extractor/ustream.py | 5 +-
 hypervideo_dl/extractor/utreon.py | 2 +-
 hypervideo_dl/extractor/varzesh3.py | 3 +-
 hypervideo_dl/extractor/veo.py | 47 +-
 hypervideo_dl/extractor/veoh.py | 62 +-
 hypervideo_dl/extractor/vgtv.py | 6 +-
 hypervideo_dl/extractor/vice.py | 1 -
 hypervideo_dl/extractor/videa.py | 9 +-
 hypervideo_dl/extractor/videocampus_sachsen.py | 96 +
 hypervideo_dl/extractor/vidio.py | 14 +-
 hypervideo_dl/extractor/vidlii.py | 50 +-
 hypervideo_dl/extractor/viewlift.py | 189 +-
 hypervideo_dl/extractor/viki.py | 30 +-
 hypervideo_dl/extractor/vimeo.py | 521 ++--
 hypervideo_dl/extractor/vimm.py | 69 +
 hypervideo_dl/extractor/vine.py | 3 +-
 hypervideo_dl/extractor/viu.py | 226 +-
 hypervideo_dl/extractor/vk.py | 118 +-
 hypervideo_dl/extractor/vlive.py | 256 +-
 hypervideo_dl/extractor/voicy.py | 7 +-
 hypervideo_dl/extractor/voot.py | 2 +-
 hypervideo_dl/extractor/vrv.py | 67 +-
 hypervideo_dl/extractor/vshare.py | 3 +-
 hypervideo_dl/extractor/vupload.py | 12 +-
 hypervideo_dl/extractor/vyborymos.py | 4 +-
 hypervideo_dl/extractor/wakanim.py | 26 +-
 hypervideo_dl/extractor/wasdtv.py | 161 ++
 hypervideo_dl/extractor/washingtonpost.py | 21 +-
 hypervideo_dl/extractor/watchbox.py | 2 -
 hypervideo_dl/extractor/wdr.py | 65 +-
 hypervideo_dl/extractor/webcaster.py | 8 +-
 hypervideo_dl/extractor/weibo.py | 3 +-
 hypervideo_dl/extractor/whowatch.py | 9 +-
 hypervideo_dl/extractor/willow.py | 58 +
 hypervideo_dl/extractor/wppilot.py | 177 ++
 hypervideo_dl/extractor/xinpianchang.py | 95 +
 hypervideo_dl/extractor/xnxx.py | 5 +-
 hypervideo_dl/extractor/xvideos.py | 32 +-
 hypervideo_dl/extractor/yahoo.py | 46 +-
 hypervideo_dl/extractor/yandexvideo.py | 99 +-
 hypervideo_dl/extractor/youjizz.py | 3 +-
 hypervideo_dl/extractor/younow.py | 5 +-
 hypervideo_dl/extractor/youtube.py | 3657 ++++++++++++++++++++----------
 hypervideo_dl/extractor/zattoo.py | 25 +-
 hypervideo_dl/extractor/zdf.py | 61 +-
 hypervideo_dl/extractor/zee5.py | 117 +-
 hypervideo_dl/extractor/zhihu.py | 4 +-
 hypervideo_dl/extractor/zingmp3.py | 159 +-
 hypervideo_dl/extractor/zoom.py | 40 +-
 416 files changed, 25732 insertions(+), 7768 deletions(-)
 create mode 100644 hypervideo_dl/extractor/abematv.py
 create mode 100644 hypervideo_dl/extractor/alsace20tv.py
 create mode 100644 hypervideo_dl/extractor/amazon.py
 create mode 100644 hypervideo_dl/extractor/ant1newsgr.py
 create mode 100644 hypervideo_dl/extractor/banbye.py
 create mode 100644 hypervideo_dl/extractor/bigo.py
 create mode 100644 hypervideo_dl/extractor/blogger.py
 create mode 100644 hypervideo_dl/extractor/breitbart.py
 create mode 100644 hypervideo_dl/extractor/cableav.py
 create mode 100644 hypervideo_dl/extractor/callin.py
 create mode 100644 hypervideo_dl/extractor/caltrans.py
 create mode 100644 hypervideo_dl/extractor/canalalpha.py
 create mode 100644 hypervideo_dl/extractor/cozytv.py
 create mode 100644 hypervideo_dl/extractor/cpac.py
 create mode 100644 hypervideo_dl/extractor/craftsy.py
 create mode 100644 hypervideo_dl/extractor/crowdbunker.py
 create mode 100644 hypervideo_dl/extractor/cybrary.py
 create mode 100644 hypervideo_dl/extractor/daftsex.py
 create mode 100644 hypervideo_dl/extractor/daystar.py
 create mode 100644 hypervideo_dl/extractor/digitalconcerthall.py
 create mode 100644 hypervideo_dl/extractor/drooble.py
 create mode 100644 hypervideo_dl/extractor/dropout.py
 create mode 100644 hypervideo_dl/extractor/ertgr.py
 create mode 100644 hypervideo_dl/extractor/europeantour.py
 create mode 100644 hypervideo_dl/extractor/fptplay.py
 create mode 100644 hypervideo_dl/extractor/gamejolt.py
 create mode 100644 hypervideo_dl/extractor/glomex.py
 create mode 100644 hypervideo_dl/extractor/gofile.py
 create mode 100644 hypervideo_dl/extractor/hse.py
 create mode 100644 hypervideo_dl/extractor/huya.py
 create mode 100644 hypervideo_dl/extractor/itprotv.py
 create mode 100644 hypervideo_dl/extractor/kelbyone.py
 create mode 100644 hypervideo_dl/extractor/lastfm.py
 create mode 100644 hypervideo_dl/extractor/mainstreaming.py
 create mode 100644 hypervideo_dl/extractor/megatvcom.py
 create mode 100644 hypervideo_dl/extractor/microsoftstream.py
 create mode 100644 hypervideo_dl/extractor/mixch.py
 create mode 100644 hypervideo_dl/extractor/mlssoccer.py
 create mode 100644 hypervideo_dl/extractor/murrtube.py
 create mode 100644 hypervideo_dl/extractor/musicdex.py
 create mode 100644 hypervideo_dl/extractor/nate.py
 create mode 100644 hypervideo_dl/extractor/newsy.py
 create mode 100644 hypervideo_dl/extractor/nfb.py
 create mode 100644 hypervideo_dl/extractor/noodlemagazine.py
 create mode 100644 hypervideo_dl/extractor/onefootball.py
 create mode 100644 hypervideo_dl/extractor/opencast.py
 create mode 100644 hypervideo_dl/extractor/panopto.py
 create mode 100644 hypervideo_dl/extractor/peekvids.py
 create mode 100644 hypervideo_dl/extractor/peertv.py
 create mode 100644 hypervideo_dl/extractor/piapro.py
 create mode 100644 hypervideo_dl/extractor/pixivsketch.py
 create mode 100644 hypervideo_dl/extractor/planetmarathi.py
 create mode 100644 hypervideo_dl/extractor/pokergo.py
 create mode 100644 hypervideo_dl/extractor/polsatgo.py
 create mode 100644 hypervideo_dl/extractor/pornez.py
 create mode 100644 hypervideo_dl/extractor/prx.py
 create mode 100644 hypervideo_dl/extractor/radiokapital.py
 create mode 100644 hypervideo_dl/extractor/radiozet.py
 create mode 100644 hypervideo_dl/extractor/redgifs.py
 create mode 100644 hypervideo_dl/extractor/rokfin.py
 create mode 100644 hypervideo_dl/extractor/rtnews.py
 create mode 100644 hypervideo_dl/extractor/rtrfm.py
 create mode 100644 hypervideo_dl/extractor/rule34video.py
 create mode 100644 hypervideo_dl/extractor/senategov.py
 create mode 100644 hypervideo_dl/extractor/skeb.py
 create mode 100644 hypervideo_dl/extractor/streamff.py
 create mode 100644 hypervideo_dl/extractor/stripchat.py
 create mode 100644 hypervideo_dl/extractor/telegram.py
 create mode 100644 hypervideo_dl/extractor/threespeak.py
 create mode 100644 hypervideo_dl/extractor/toggo.py
 create mode 100644 hypervideo_dl/extractor/trueid.py
 create mode 100644 hypervideo_dl/extractor/tvopengr.py
 create mode 100644 hypervideo_dl/extractor/videocampus_sachsen.py
 create mode 100644 hypervideo_dl/extractor/vimm.py
 create mode 100644 hypervideo_dl/extractor/wasdtv.py
 create mode 100644 hypervideo_dl/extractor/willow.py
 create mode 100644 hypervideo_dl/extractor/wppilot.py
 create mode 100644 hypervideo_dl/extractor/xinpianchang.py

diff --git a/hypervideo_dl/extractor/__init__.py b/hypervideo_dl/extractor/__init__.py
index 198c4ae..b354842 100644
--- a/hypervideo_dl/extractor/__init__.py
+++ b/hypervideo_dl/extractor/__init__.py
@@ -1,14 +1,15 @@
-from __future__ import unicode_literals
+import os
 
 from ..utils import load_plugins
 
-try:
-    from .lazy_extractors import *
-    from .lazy_extractors import _ALL_CLASSES
-    _LAZY_LOADER = True
-    _PLUGIN_CLASSES = {}
-except ImportError:
-    _LAZY_LOADER = False
+_LAZY_LOADER = False
+if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'):
+    try:
+        from .lazy_extractors import *
+        from .lazy_extractors import _ALL_CLASSES
+        _LAZY_LOADER = True
+    except ImportError:
+        pass
 
 if not _LAZY_LOADER:
     from .extractors import *
@@ -19,8 +20,8 @@ if not _LAZY_LOADER:
     ]
     _ALL_CLASSES.append(GenericIE)
 
-    _PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals())
-    _ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES
+_PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals())
+_ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES
 
 
 def gen_extractor_classes():
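The __init__.py hunk above moves plugin loading out of the lazy-import branch, so plugin extractors take precedence over built-ins whether or not the generated lazy_extractors module is available (and the YTDLP_NO_LAZY_EXTRACTORS environment variable can force the full import). A minimal, runnable sketch of the resulting precedence rule - the class names and the plugins dict below are made up; only the ordering mirrors the diff:

    import os

    def gather_extractors(builtin, plugins):
        # Plugins are prepended so a plugin class shadows a built-in one;
        # this now happens regardless of which import path was taken.
        return list(plugins.values()) + list(builtin)

    builtin = ['FooIE', 'BarIE']                 # stand-ins for real extractor classes
    plugins = {'FooPluginIE': 'FooPluginIE'}     # what load_plugins() might return
    print(os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'))  # set to skip lazy_extractors
    print(gather_extractors(builtin, plugins))
    # ['FooPluginIE', 'FooIE', 'BarIE'] - plugin classes win
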
diff --git a/hypervideo_dl/extractor/abc.py b/hypervideo_dl/extractor/abc.py
index 3e20216..6fe195e 100644
--- a/hypervideo_dl/extractor/abc.py
+++ b/hypervideo_dl/extractor/abc.py
@@ -8,6 +8,7 @@ import time
 from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
+    dict_get,
     ExtractorError,
     js_to_json,
     int_or_none,
@@ -212,7 +213,7 @@ class ABCIViewIE(InfoExtractor):
             'hdnea': token,
         })
 
-        for sd in ('720', 'sd', 'sd-low'):
+        for sd in ('1080', '720', 'sd', 'sd-low'):
             sd_url = try_get(
                 stream, lambda x: x['streams']['hls'][sd], compat_str)
             if not sd_url:
@@ -233,8 +234,6 @@ class ABCIViewIE(InfoExtractor):
         }]
 
         is_live = video_params.get('livestream') == '1'
-        if is_live:
-            title = self._live_title(title)
 
         return {
             'id': video_id,
@@ -255,3 +254,65 @@ class ABCIViewIE(InfoExtractor):
             'subtitles': subtitles,
             'is_live': is_live,
         }
+
+
+class ABCIViewShowSeriesIE(InfoExtractor):
+    IE_NAME = 'abc.net.au:iview:showseries'
+    _VALID_URL = r'https?://iview\.abc\.net\.au/show/(?P<show_id>[^/]+)(?:/series/\d+)?$'
+    _GEO_COUNTRIES = ['AU']
+
+    _TESTS = [{
+        'url': 'https://iview.abc.net.au/show/upper-middle-bogan',
+        'info_dict': {
+            'id': '124870-1',
+            'title': 'Series 1',
+            'description': 'md5:93119346c24a7c322d446d8eece430ff',
+            'series': 'Upper Middle Bogan',
+            'season': 'Series 1',
+            'thumbnail': r're:^https?://cdn\.iview\.abc\.net\.au/thumbs/.*\.jpg$'
+        },
+        'playlist_count': 8,
+    }, {
+        'url': 'https://iview.abc.net.au/show/upper-middle-bogan',
+        'info_dict': {
+            'id': 'CO1108V001S00',
+            'ext': 'mp4',
+            'title': 'Series 1 Ep 1 I\'m A Swan',
+            'description': 'md5:7b676758c1de11a30b79b4d301e8da93',
+            'series': 'Upper Middle Bogan',
+            'uploader_id': 'abc1',
+            'upload_date': '20210630',
+            'timestamp': 1625036400,
+        },
+        'params': {
+            'noplaylist': True,
+            'skip_download': 'm3u8',
+        },
+    }]
+
+    def _real_extract(self, url):
+        show_id = self._match_id(url)
+        webpage = self._download_webpage(url, show_id)
+        webpage_data = self._search_regex(
+            r'window\.__INITIAL_STATE__\s*=\s*[\'"](.+?)[\'"]\s*;',
+            webpage, 'initial state')
+        video_data = self._parse_json(
+            unescapeHTML(webpage_data).encode('utf-8').decode('unicode_escape'), show_id)
+        video_data = video_data['route']['pageData']['_embedded']
+
+        highlight = try_get(video_data, lambda x: x['highlightVideo']['shareUrl'])
+        if not self._yes_playlist(show_id, bool(highlight), video_label='highlight video'):
+            return self.url_result(highlight, ie=ABCIViewIE.ie_key())
+
+        series = video_data['selectedSeries']
+        return {
+            '_type': 'playlist',
+            'entries': [self.url_result(episode['shareUrl'])
+                        for episode in series['_embedded']['videoEpisodes']],
+            'id': series.get('id'),
+            'title': dict_get(series, ('title', 'displaySubtitle')),
+            'description': series.get('description'),
+            'series': dict_get(series, ('showTitle', 'displayTitle')),
+            'season': dict_get(series, ('title', 'displaySubtitle')),
+            'thumbnail': series.get('thumbnail'),
+        }
diff --git a/hypervideo_dl/extractor/abematv.py b/hypervideo_dl/extractor/abematv.py
new file mode 100644
index 0000000..27b7d86
--- /dev/null
+++ b/hypervideo_dl/extractor/abematv.py
@@ -0,0 +1,476 @@
+import io
+import json
+import time
+import hashlib
+import hmac
+import re
+import struct
+from base64 import urlsafe_b64encode
+from binascii import unhexlify
+
+from .common import InfoExtractor
+from ..aes import aes_ecb_decrypt
+from ..compat import (
+    compat_urllib_response,
+    compat_urllib_parse_urlparse,
+    compat_urllib_request,
+)
+from ..utils import (
+    ExtractorError,
+    decode_base,
+    int_or_none,
+    random_uuidv4,
+    request_to_url,
+    time_seconds,
+    update_url_query,
+    traverse_obj,
+    intlist_to_bytes,
+    bytes_to_intlist,
+    urljoin,
+)
+
+
+# NOTE: the network-handler code below is a temporary measure until the network
+# stack overhaul PRs are merged (#2861/#2862)
+
+def add_opener(ydl, handler):
+    ''' Add a handler for opening URLs, like _download_webpage '''
+    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
+    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
+    assert isinstance(ydl._opener, compat_urllib_request.OpenerDirector)
+    ydl._opener.add_handler(handler)
+
+
+def remove_opener(ydl, handler):
+    '''
+    Remove handler(s) for opening URLs
+    @param handler  Either the handler object itself or a handler type.
+                    Passing a type removes every handler for which isinstance returns True.
+    '''
+    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426
+    # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605
+    opener = ydl._opener
+    assert isinstance(ydl._opener, compat_urllib_request.OpenerDirector)
+    if isinstance(handler, (type, tuple)):
+        find_cp = lambda x: isinstance(x, handler)
+    else:
+        find_cp = lambda x: x is handler
+
+    removed = []
+    for meth in dir(handler):
+        if meth in ["redirect_request", "do_open", "proxy_open"]:
+            # oops, coincidental match
+            continue
+
+        i = meth.find("_")
+        protocol = meth[:i]
+        condition = meth[i + 1:]
+
+        if condition.startswith("error"):
+            j = condition.find("_") + i + 1
+            kind = meth[j + 1:]
+            try:
+                kind = int(kind)
+            except ValueError:
+                pass
+            lookup = opener.handle_error.get(protocol, {})
+            opener.handle_error[protocol] = lookup
+        elif condition == "open":
+            kind = protocol
+            lookup = opener.handle_open
+        elif condition == "response":
+            kind = protocol
+            lookup = opener.process_response
+        elif condition == "request":
+            kind = protocol
+            lookup = opener.process_request
+        else:
+            continue
+
+        handlers = lookup.setdefault(kind, [])
+        if handlers:
+            # collect the matches before filtering them out (extend, not append:
+            # appending the generator object would leave `removed` truthy even
+            # when nothing actually matched)
+            removed.extend(x for x in handlers if find_cp(x))
+            handlers[:] = [x for x in handlers if not find_cp(x)]
+
+    if removed:
+        for x in opener.handlers:
+            if find_cp(x):
+                x.add_parent(None)
+        opener.handlers[:] = [x for x in opener.handlers if not find_cp(x)]
+
+
+class AbemaLicenseHandler(compat_urllib_request.BaseHandler):
+    handler_order = 499
+    STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
+    HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E'
+
+    def __init__(self, ie: 'AbemaTVIE'):
+        # the protocol that this should really handle is 'abematv-license://'
+        # abematv_license_open is just a placeholder for development purposes
+        # ref. https://github.com/python/cpython/blob/f4c03484da59049eb62a9bf7777b963e2267d187/Lib/urllib/request.py#L510
+        setattr(self, 'abematv-license_open', getattr(self, 'abematv_license_open'))
+        self.ie = ie
+
+    def _get_videokey_from_ticket(self, ticket):
+        to_show = self.ie._downloader.params.get('verbose', False)
+        media_token = self.ie._get_media_token(to_show=to_show)
+
+        license_response = self.ie._download_json(
+            'https://license.abema.io/abematv-hls', None,
+            note='Requesting playback license' if to_show else False,
+            query={'t': media_token},
+            data=json.dumps({
+                'kv': 'a',
+                'lt': ticket
+            }).encode('utf-8'),
+            headers={
+                'Content-Type': 'application/json',
+            })
+
+        res = decode_base(license_response['k'], self.STRTABLE)
+        encvideokey = bytes_to_intlist(struct.pack('>QQ', res >> 64, res & 0xffffffffffffffff))
+
+        h = hmac.new(
+            unhexlify(self.HKEY),
+            (license_response['cid'] + self.ie._DEVICE_ID).encode('utf-8'),
+            digestmod=hashlib.sha256)
+        enckey = bytes_to_intlist(h.digest())
+
+        return intlist_to_bytes(aes_ecb_decrypt(encvideokey, enckey))
+
+    def abematv_license_open(self, url):
+        url = request_to_url(url)
+        ticket = compat_urllib_parse_urlparse(url).netloc
+        response_data = self._get_videokey_from_ticket(ticket)
+        return compat_urllib_response.addinfourl(io.BytesIO(response_data), headers={
+            'Content-Length': len(response_data),
+        }, url=url, code=200)
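The key derivation in _get_videokey_from_ticket is hidden behind yt-dlp helpers (decode_base, aes_ecb_decrypt). As a rough, standard-library-only illustration of the same flow - the decode_base reimplementation below mirrors what the helper does, and the ticket/cid/device-id values are made up:

    import hashlib
    import hmac
    from binascii import unhexlify

    STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz'
    HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E'

    def decode_base(value, digits):
        # treat `value` as a number written in base len(digits) over the
        # given alphabet - this is what yt-dlp's decode_base helper does
        num = 0
        for char in value:
            num = num * len(digits) + digits.index(char)
        return num

    # made-up stand-ins for license_response['k'], license_response['cid']
    # and the device id
    encrypted_key_int = decode_base('2a3B', STRTABLE)
    encrypted_key = encrypted_key_int.to_bytes(16, 'big')  # like struct.pack('>QQ', ...)
    aes_key = hmac.new(unhexlify(HKEY), b'some-cid' + b'some-device-id',
                       digestmod=hashlib.sha256).digest()
    # the real code finishes with: videokey = aes_ecb_decrypt(encrypted_key, aes_key)
    print(len(encrypted_key), len(aes_key))  # 16, 32
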
+
+
+class AbemaTVBaseIE(InfoExtractor):
+    def _extract_breadcrumb_list(self, webpage, video_id):
+        for jld in re.finditer(
+                r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
+                webpage):
+            jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False)
+            if jsonld:
+                if jsonld.get('@type') != 'BreadcrumbList':
+                    continue
+                trav = traverse_obj(jsonld, ('itemListElement', ..., 'name'))
+                if trav:
+                    return trav
+        return []
+
+
+class AbemaTVIE(AbemaTVBaseIE):
+    _VALID_URL = r'https?://abema\.tv/(?P<type>now-on-air|video/episode|channels/.+?/slots)/(?P<id>[^?/]+)'
+    _NETRC_MACHINE = 'abematv'
+    _TESTS = [{
+        'url': 'https://abema.tv/video/episode/194-25_s2_p1',
+        'info_dict': {
+            'id': '194-25_s2_p1',
+            'title': '第1話 「チーズケーキ」 「モーニング再び」',
+            'series': '異世界食堂2',
+            'series_number': 2,
+            'episode': '第1話 「チーズケーキ」 「モーニング再び」',
+            'episode_number': 1,
+        },
+        'skip': 'expired',
+    }, {
+        'url': 'https://abema.tv/channels/anime-live2/slots/E8tvAnMJ7a9a5d',
+        'info_dict': {
+            'id': 'E8tvAnMJ7a9a5d',
+            'title': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】',
+            'series': 'ゆるキャン△ SEASON2',
+            'episode': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】',
+            'series_number': 2,
+            'episode_number': 1,
+            'description': 'md5:9c5a3172ae763278f9303922f0ea5b17',
+        },
+        'skip': 'expired',
+    }, {
+        'url': 'https://abema.tv/video/episode/87-877_s1282_p31047',
+        'info_dict': {
+            'id': '87-877_s1282_p31047',
+            'title': '第5話『光射す』',
+            'description': 'md5:56d4fc1b4f7769ded5f923c55bb4695d',
+            'thumbnail': r're:https://hayabusa\.io/.+',
+            'series': '相棒',
+            'episode': '第5話『光射す』',
+        },
+        'skip': 'expired',
+    }, {
+        'url': 'https://abema.tv/now-on-air/abema-anime',
+        'info_dict': {
+            'id': 'abema-anime',
+            # this varies
+            # 'title': '女子高生の無駄づかい 全話一挙【無料ビデオ72時間】',
+            'description': 'md5:55f2e61f46a17e9230802d7bcc913d5f',
+            'is_live': True,
+        },
+        'skip': 'Not supported until hypervideo implements native live downloader OR AbemaTV can start a local HTTP server',
+    }]
+    _USERTOKEN = None
+    _DEVICE_ID = None
+    _TIMETABLE = None
+    _MEDIATOKEN = None
+
+    _SECRETKEY = b'v+Gjs=25Aw5erR!J8ZuvRrCx*rGswhB&qdHd_SYerEWdU&a?3DzN9BRbp5KwY4hEmcj5#fykMjJ=AuWz5GSMY-d@H7DMEh3M@9n2G552Us$$k9cD=3TxwWe86!x#Zyhe'
+
+    def _generate_aks(self, deviceid):
+        deviceid = deviceid.encode('utf-8')
+        # add 1 hour and then drop minutes and seconds
+        ts_1hour = int((time_seconds(hours=9) // 3600 + 1) * 3600)
+        time_struct = time.gmtime(ts_1hour)
+        ts_1hour_str = str(ts_1hour).encode('utf-8')
+
+        tmp = None
+
+        def mix_once(nonce):
+            nonlocal tmp
+            h = hmac.new(self._SECRETKEY, digestmod=hashlib.sha256)
+            h.update(nonce)
+            tmp = h.digest()
+
+        def mix_tmp(count):
+            nonlocal tmp
+            for i in range(count):
+                mix_once(tmp)
+
+        def mix_twist(nonce):
+            nonlocal tmp
+            mix_once(urlsafe_b64encode(tmp).rstrip(b'=') + nonce)
+
+        mix_once(self._SECRETKEY)
+        mix_tmp(time_struct.tm_mon)
+        mix_twist(deviceid)
+        mix_tmp(time_struct.tm_mday % 5)
+        mix_twist(ts_1hour_str)
+        mix_tmp(time_struct.tm_hour % 5)
+
+        return urlsafe_b64encode(tmp).rstrip(b'=').decode('utf-8')
+
+    def _get_device_token(self):
+        if self._USERTOKEN:
+            return self._USERTOKEN
+
+        self._DEVICE_ID = random_uuidv4()
+        aks = self._generate_aks(self._DEVICE_ID)
+        user_data = self._download_json(
+            'https://api.abema.io/v1/users', None, note='Authorizing',
+            data=json.dumps({
+                'deviceId': self._DEVICE_ID,
+                'applicationKeySecret': aks,
+            }).encode('utf-8'),
+            headers={
+                'Content-Type': 'application/json',
+            })
+        self._USERTOKEN = user_data['token']
+
+        # don't register the handler twice or more, though remove_opener guards against it
+        remove_opener(self._downloader, AbemaLicenseHandler)
+        add_opener(self._downloader, AbemaLicenseHandler(self))
+
+        return self._USERTOKEN
+
+    def _get_media_token(self, invalidate=False, to_show=True):
+        if not invalidate and self._MEDIATOKEN:
+            return self._MEDIATOKEN
+
+        self._MEDIATOKEN = self._download_json(
+            'https://api.abema.io/v1/media/token', None,
+            note='Fetching media token' if to_show else False,
+            query={
+                'osName': 'android',
+                'osVersion': '6.0.1',
+                'osLang': 'ja_JP',
+                'osTimezone': 'Asia/Tokyo',
+                'appId': 'tv.abema',
+                'appVersion': '3.27.1'
+            }, headers={
+                'Authorization': 'bearer ' + self._get_device_token()
+            })['token']
+
+        return self._MEDIATOKEN
+
+    def _perform_login(self, username, password):
+        if '@' in username:  # don't strictly check whether it's an email address or not
+            ep, method = 'user/email', 'email'
+        else:
+            ep, method = 'oneTimePassword', 'userId'
+
+        login_response = self._download_json(
+            f'https://api.abema.io/v1/auth/{ep}', None, note='Logging in',
+            data=json.dumps({
+                method: username,
+                'password': password
+            }).encode('utf-8'), headers={
+                'Authorization': 'bearer ' + self._get_device_token(),
+                'Origin': 'https://abema.tv',
+                'Referer': 'https://abema.tv/',
+                'Content-Type': 'application/json',
+            })
+
+        self._USERTOKEN = login_response['token']
+        self._get_media_token(True)
+
+    def _real_extract(self, url):
+        # Starting a download from an infojson written by this extractor is
+        # undefined behavior and will never be fixed; you must trigger downloads
+        # by directly specifying the URL
+        # (unless there's a way to hook in before the extractor downloads anything)
+        video_id, video_type = self._match_valid_url(url).group('id', 'type')
+        headers = {
+            'Authorization': 'Bearer ' + self._get_device_token(),
+        }
+        video_type = video_type.split('/')[-1]
+
+        webpage = self._download_webpage(url, video_id)
+        canonical_url = self._search_regex(
+            r'<link\s+rel="canonical"\s*href="(.+?)"', webpage, 'canonical URL',
+            default=url)
+        info = self._search_json_ld(webpage, video_id, default={})
+
+        title = self._search_regex(
+            r'<span\s*class=".+?EpisodeTitleBlock__title"\s*>(.+?)</span>',
+            webpage, 'title', default=None)
+        if not title:
+            jsonld = None
+            for jld in re.finditer(
+                    r'(?is)(?:)?<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
+                    webpage):
+                jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False)
+                if jsonld:
+                    break
+            if jsonld:
+                title = jsonld.get('caption')
+        if not title and video_type == 'now-on-air':
+            if not self._TIMETABLE:
+                # cache the timetable because it grows to 5 MiB in size (!!)
+                self._TIMETABLE = self._download_json(
+                    'https://api.abema.io/v1/timetable/dataSet?debug=false', video_id,
+                    headers=headers)
+            now = time_seconds(hours=9)
+            for slot in self._TIMETABLE.get('slots', []):
+                if slot.get('channelId') != video_id:
+                    continue
+                if slot['startAt'] <= now and now < slot['endAt']:
+                    title = slot['title']
+                    break
+
+        # read the breadcrumb at the top of the page
+        breadcrumb = self._extract_breadcrumb_list(webpage, video_id)
+        if breadcrumb:
+            # the breadcrumb list translates to (example taken from the 1st test for this IE):
+            # Home > Anime (genre) > Isekai Shokudo 2 (series name) > Episode 1 "Cheese cakes" "Morning again" (episode title)
+            # hence this works
+            info['series'] = breadcrumb[-2]
+            info['episode'] = breadcrumb[-1]
+            if not title:
+                title = info['episode']
+
+        description = self._html_search_regex(
+            (r'<p\s+class="com-video-EpisodeDetailsBlock__content"><span\s+class=".+?">(.+?)</span></p><div',
+             r'<span\s+class=".+?SlotSummary.+?">(.+?)</span></div><div'),
+            webpage, 'description', default=None, group=1)
+
+
+class AbemaTVTitleIE(AbemaTVBaseIE):
+    _VALID_URL = r'https?://abema\.tv/video/title/(?P<id>[^?/]+)'
+
+    _TESTS = [{
+        'url': 'https://abema.tv/video/title/90-1597',
+        'info_dict': {
+            'id': '90-1597',
+            'title': 'シャッフルアイランド',
+        },
+        'playlist_mincount': 2,
+    }, {
+        'url': 'https://abema.tv/video/title/193-132',
+        'info_dict': {
+            'id': '193-132',
+            'title': '真心が届く~僕とスターのオフィス・ラブ!?~',
+        },
+        'playlist_mincount': 16,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        playlist_title, breadcrumb = None, self._extract_breadcrumb_list(webpage, video_id)
+        if breadcrumb:
+            playlist_title = breadcrumb[-1]
+
+        playlist = [
+            self.url_result(urljoin('https://abema.tv/', mobj.group(1)))
+            for mobj in re.finditer(r'<a\s+href="(/video/episode/[^"]+)"', webpage)]
+
+        return self.playlist_result(playlist, playlist_id=video_id, playlist_title=playlist_title)
diff --git a/hypervideo_dl/extractor/adn.py b/hypervideo_dl/extractor/adn.py
--- a/hypervideo_dl/extractor/adn.py
+++ b/hypervideo_dl/extractor/adn.py
+                    text.replace('<i>', '{\\i1}').replace('</i>', '{\\i0}'))
@@ -133,10 +126,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text'''
             }])
         return subtitles
 
-    def _real_initialize(self):
-        username, password = self._get_login_info()
-        if not username:
-            return
+    def _perform_login(self, username, password):
         try:
             access_token = (self._download_json(
                 self._API_BASE_URL + 'authentication/login', None,
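A pattern that repeats throughout this patch (adn, afreecatv, alura, animeondemand, and others): the old boilerplate of a _real_initialize that fetches credentials and bails out when none are set is replaced by a _perform_login(username, password) hook, which the base class only invokes when credentials actually exist. A minimal, runnable sketch of the two shapes - the stub base class and the ExampleIE name are illustrative, not yt-dlp's actual class:

    class InfoExtractorStub:
        """Stub mimicking the relevant part of the new base-class behaviour."""
        def _get_login_info(self):
            return ('user', 'hunter2')  # pretend credentials were configured

        def initialize(self):
            username, password = self._get_login_info()
            if username:  # the base class now owns this check
                self._perform_login(username, password)

        def _perform_login(self, username, password):
            pass  # overridden by extractors that support login

    class ExampleIE(InfoExtractorStub):
        def _perform_login(self, username, password):
            print(f'logging in as {username}')

    ExampleIE().initialize()  # -> logging in as user

This is why each converted extractor below simply deletes its _real_initialize/_login pair and keeps only the request-sending body.
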
diff --git a/hypervideo_dl/extractor/adobeconnect.py b/hypervideo_dl/extractor/adobeconnect.py
index 728549e..e2e6f93 100644
--- a/hypervideo_dl/extractor/adobeconnect.py
+++ b/hypervideo_dl/extractor/adobeconnect.py
@@ -14,7 +14,7 @@ class AdobeConnectIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
-        title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
+        title = self._html_extract_title(webpage)
         qs = compat_parse_qs(self._search_regex(r"swfUrl\s*=\s*'([^']+)'", webpage, 'swf url').split('?')[1])
         is_live = qs.get('isLive', ['false'])[0] == 'true'
         formats = []
@@ -31,7 +31,7 @@ class AdobeConnectIE(InfoExtractor):
 
         return {
             'id': video_id,
-            'title': self._live_title(title) if is_live else title,
+            'title': title,
             'formats': formats,
             'is_live': is_live,
         }
diff --git a/hypervideo_dl/extractor/adobepass.py b/hypervideo_dl/extractor/adobepass.py
index 9378c33..5d98301 100644
--- a/hypervideo_dl/extractor/adobepass.py
+++ b/hypervideo_dl/extractor/adobepass.py
@@ -39,8 +39,8 @@ MSO_INFO = {
     },
     'RCN': {
         'name': 'RCN',
-        'username_field': 'UserName',
-        'password_field': 'UserPassword',
+        'username_field': 'username',
+        'password_field': 'password',
     },
     'Rogers': {
         'name': 'Rogers',
@@ -1345,6 +1345,11 @@ MSO_INFO = {
         'username_field': 'username',
         'password_field': 'password',
     },
+    'Suddenlink': {
+        'name': 'Suddenlink',
+        'username_field': 'username',
+        'password_field': 'password',
+    },
 }
 
 
@@ -1635,6 +1640,58 @@ class AdobePassIE(InfoExtractor):
                     urlh.geturl(), video_id, 'Sending final bookend',
                     query=hidden_data)
 
                 post_form(mvpd_confirm_page_res, 'Confirming Login')
+            elif mso_id == 'Suddenlink':
+                # Suddenlink is similar to SlingTV in using a tab history count and a meta refresh,
+                # but it also does a dynamic redirect using JavaScript that has to be followed as well
+                first_bookend_page, urlh = post_form(
+                    provider_redirect_page_res, 'Pressing Continue...')
+
+                hidden_data = self._hidden_inputs(first_bookend_page)
+                hidden_data['history_val'] = 1
+
+                provider_login_redirect_page_res = self._download_webpage_handle(
+                    urlh.geturl(), video_id, 'Sending First Bookend',
+                    query=hidden_data)
+
+                provider_login_redirect_page, urlh = provider_login_redirect_page_res
+
+                # Some website partners seem to not have the extra ajaxurl redirect step, so we check if we already
+                # have the login prompt or not
+                if 'id="password" type="password" name="password"' in provider_login_redirect_page:
+                    provider_login_page_res = provider_login_redirect_page_res
+                else:
+                    provider_tryauth_url = self._html_search_regex(
+                        r'url:\s*[\'"]([^\'"]+)', provider_login_redirect_page, 'ajaxurl')
+                    provider_tryauth_page = self._download_webpage(
+                        provider_tryauth_url, video_id, 'Submitting TryAuth',
+                        query=hidden_data)
+
+                    provider_login_page_res = self._download_webpage_handle(
+                        f'https://authorize.suddenlink.net/saml/module.php/authSynacor/login.php?AuthState={provider_tryauth_page}',
+                        video_id, 'Getting Login Page',
+                        query=hidden_data)
+
+                provider_association_redirect, urlh = post_form(
+                    provider_login_page_res, 'Logging in', {
+                        mso_info['username_field']: username,
+                        mso_info['password_field']: password
+                    })
+
+                provider_refresh_redirect_url = extract_redirect_url(
+                    provider_association_redirect, url=urlh.geturl())
+
+                last_bookend_page, urlh = self._download_webpage_handle(
+                    provider_refresh_redirect_url, video_id,
+                    'Downloading Auth Association Redirect Page')
+
+                hidden_data = self._hidden_inputs(last_bookend_page)
+                hidden_data['history_val'] = 3
+
+                mvpd_confirm_page_res = self._download_webpage_handle(
+                    urlh.geturl(), video_id, 'Sending Final Bookend',
+                    query=hidden_data)
+
+                post_form(mvpd_confirm_page_res, 'Confirming Login')
             else:
                 # Some providers (e.g. DIRECTV NOW) have another meta refresh
diff --git a/hypervideo_dl/extractor/adobetv.py b/hypervideo_dl/extractor/adobetv.py
index 12b8192..3cfa1ff 100644
--- a/hypervideo_dl/extractor/adobetv.py
+++ b/hypervideo_dl/extractor/adobetv.py
@@ -9,6 +9,7 @@ from ..utils import (
     float_or_none,
     int_or_none,
     ISO639Utils,
+    join_nonempty,
     OnDemandPagedList,
     parse_duration,
     str_or_none,
@@ -263,7 +264,7 @@ class AdobeTVVideoIE(AdobeTVBaseIE):
                 continue
             formats.append({
                 'filesize': int_or_none(source.get('kilobytes') or None, invscale=1000),
-                'format_id': '-'.join(filter(None, [source.get('format'), source.get('label')])),
+                'format_id': join_nonempty(source.get('format'), source.get('label')),
                 'height': int_or_none(source.get('height') or None),
                 'tbr': int_or_none(source.get('bitrate') or None),
                 'width': int_or_none(source.get('width') or None),
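The adobetv hunk (and the animeondemand hunks further down) replace '-'.join(filter(None, [...])) chains with yt-dlp's join_nonempty helper. A quick illustration of its behaviour, using a simplified reimplementation in case you want to try it outside yt-dlp:

    def join_nonempty(*values, delim='-'):
        # simplified sketch of yt-dlp's utils.join_nonempty
        return delim.join(str(v) for v in values if v)

    print(join_nonempty('hls', None, '720'))            # hls-720
    print(join_nonempty('mp4', '', 'eng', delim=', '))  # mp4, eng
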
diff --git a/hypervideo_dl/extractor/afreecatv.py b/hypervideo_dl/extractor/afreecatv.py
index 063872b..77f0e3c 100644
--- a/hypervideo_dl/extractor/afreecatv.py
+++ b/hypervideo_dl/extractor/afreecatv.py
@@ -10,7 +10,11 @@ from ..utils import (
     determine_ext,
     ExtractorError,
     int_or_none,
+    qualities,
+    traverse_obj,
     unified_strdate,
+    unified_timestamp,
+    update_url_query,
     url_or_none,
     urlencode_postdata,
     xpath_text,
@@ -28,7 +32,7 @@ class AfreecaTVIE(InfoExtractor):
                             /app/(?:index|read_ucc_bbs)\.cgi|
                             /player/[Pp]layer\.(?:swf|html)
                         )\?.*?\bnTitleNo=|
-                        vod\.afreecatv\.com/PLAYER/STATION/
+                        vod\.afreecatv\.com/(PLAYER/STATION|player)/
                     )
                     (?P<id>\d+)
                     '''
@@ -166,6 +170,9 @@ class AfreecaTVIE(InfoExtractor):
     }, {
         'url': 'http://vod.afreecatv.com/PLAYER/STATION/15055030',
         'only_matching': True,
+    }, {
+        'url': 'http://vod.afreecatv.com/player/15055030',
+        'only_matching': True,
     }]
 
     @staticmethod
@@ -177,14 +184,7 @@ class AfreecaTVIE(InfoExtractor):
             video_key['part'] = int(m.group('part'))
         return video_key
 
-    def _real_initialize(self):
-        self._login()
-
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
+    def _perform_login(self, username, password):
         login_form = {
             'szWork': 'login',
             'szType': 'json',
@@ -380,3 +380,105 @@ class AfreecaTVIE(InfoExtractor):
             })
 
         return info
+
+
+class AfreecaTVLiveIE(AfreecaTVIE):
+
+    IE_NAME = 'afreecatv:live'
+    _VALID_URL = r'https?://play\.afreeca(?:tv)?\.com/(?P<id>[^/]+)(?:/(?P<bno>\d+))?'
+    _TESTS = [{
+        'url': 'https://play.afreecatv.com/pyh3646/237852185',
+        'info_dict': {
+            'id': '237852185',
+            'ext': 'mp4',
+            'title': '【 우루과이 오늘은 무슨일이? 】',
+            'uploader': '박진우[JINU]',
+            'uploader_id': 'pyh3646',
+            'timestamp': 1640661495,
+            'is_live': True,
+        },
+        'skip': 'Livestream has ended',
+    }, {
+        'url': 'http://play.afreeca.com/pyh3646/237852185',
+        'only_matching': True,
+    }, {
+        'url': 'http://play.afreeca.com/pyh3646',
+        'only_matching': True,
+    }]
+
+    _LIVE_API_URL = 'https://live.afreecatv.com/afreeca/player_live_api.php'
+
+    _QUALITIES = ('sd', 'hd', 'hd2k', 'original')
+
+    def _real_extract(self, url):
+        broadcaster_id, broadcast_no = self._match_valid_url(url).group('id', 'bno')
+        password = self.get_param('videopassword')
+
+        info = self._download_json(self._LIVE_API_URL, broadcaster_id, fatal=False,
+                                   data=urlencode_postdata({'bid': broadcaster_id})) or {}
+        channel_info = info.get('CHANNEL') or {}
+        broadcaster_id = channel_info.get('BJID') or broadcaster_id
+        broadcast_no = channel_info.get('BNO') or broadcast_no
+        password_protected = channel_info.get('BPWD')
+        if not broadcast_no:
+            raise ExtractorError(f'Unable to extract broadcast number ({broadcaster_id} may not be live)', expected=True)
+        if password_protected == 'Y' and password is None:
+            raise ExtractorError(
+                'This livestream is protected by a password, use the --video-password option',
+                expected=True)
+
+        formats = []
+        quality_key = qualities(self._QUALITIES)
+        for quality_str in self._QUALITIES:
+            params = {
+                'bno': broadcast_no,
+                'stream_type': 'common',
+                'type': 'aid',
+                'quality': quality_str,
+            }
+            if password is not None:
+                params['pwd'] = password
+            aid_response = self._download_json(
+                self._LIVE_API_URL, broadcast_no, fatal=False,
+                data=urlencode_postdata(params),
+                note=f'Downloading access token for {quality_str} stream',
+                errnote=f'Unable to download access token for {quality_str} stream')
+            aid = traverse_obj(aid_response, ('CHANNEL', 'AID'))
+            if not aid:
+                continue
+
+            stream_base_url = channel_info.get('RMD') or 'https://livestream-manager.afreecatv.com'
+            stream_info = self._download_json(
+                f'{stream_base_url}/broad_stream_assign.html', broadcast_no, fatal=False,
+                query={
+                    'return_type': channel_info.get('CDN', 'gcp_cdn'),
+                    'broad_key': f'{broadcast_no}-common-{quality_str}-hls',
+                },
+                note=f'Downloading metadata for {quality_str} stream',
+                errnote=f'Unable to download metadata for {quality_str} stream') or {}
+
+            if stream_info.get('view_url'):
+                formats.append({
+                    'format_id': quality_str,
+                    'url': update_url_query(stream_info['view_url'], {'aid': aid}),
+                    'ext': 'mp4',
+                    'protocol': 'm3u8',
+                    'quality': quality_key(quality_str),
+                })
+
+        self._sort_formats(formats)
+
+        station_info = self._download_json(
+            'https://st.afreecatv.com/api/get_station_status.php', broadcast_no,
+            query={'szBjId': broadcaster_id}, fatal=False,
+            note='Downloading channel metadata', errnote='Unable to download channel metadata') or {}
+
+        return {
+            'id': broadcast_no,
+            'title': channel_info.get('TITLE') or station_info.get('station_title'),
+            'uploader': channel_info.get('BJNICK') or station_info.get('station_name'),
+            'uploader_id': broadcaster_id,
+            'timestamp': unified_timestamp(station_info.get('broad_start')),
+            'formats': formats,
+            'is_live': True,
+        }
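AfreecaTVLiveIE ranks its stream variants with utils.qualities. That helper just maps a preference tuple to a sort key, roughly:

    def qualities(quality_ids):
        """Rough equivalent of yt-dlp's utils.qualities helper."""
        def q(qid):
            try:
                return quality_ids.index(qid)
            except ValueError:
                return -1
        return q

    quality_key = qualities(('sd', 'hd', 'hd2k', 'original'))
    print(quality_key('original'), quality_key('sd'), quality_key('bogus'))  # 3 0 -1

Higher values sort as better formats, so 'original' beats 'hd2k', and unknown ids fall to the bottom.
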
diff --git a/hypervideo_dl/extractor/aliexpress.py b/hypervideo_dl/extractor/aliexpress.py
index 6f241e6..9722fe9 100644
--- a/hypervideo_dl/extractor/aliexpress.py
+++ b/hypervideo_dl/extractor/aliexpress.py
@@ -18,7 +18,7 @@ class AliExpressLiveIE(InfoExtractor):
             'id': '2800002704436634',
             'ext': 'mp4',
             'title': 'CASIMA7.22',
-            'thumbnail': r're:http://.*\.jpg',
+            'thumbnail': r're:https?://.*\.jpg',
             'uploader': 'CASIMA Official Store',
             'timestamp': 1500717600,
             'upload_date': '20170722',
diff --git a/hypervideo_dl/extractor/aljazeera.py b/hypervideo_dl/extractor/aljazeera.py
index e829b45..7bcdb7a 100644
--- a/hypervideo_dl/extractor/aljazeera.py
+++ b/hypervideo_dl/extractor/aljazeera.py
@@ -1,55 +1,86 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
 import json
 
 from .common import InfoExtractor
+from ..utils import (
+    try_get,
+)
 
 
 class AlJazeeraIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?P<type>program/[^/]+|(?:feature|video)s)/\d{4}/\d{1,2}/\d{1,2}/(?P<id>[^/?&#]+)'
+    _VALID_URL = r'https?://(?P<base>\w+\.aljazeera\.\w+)/(?P<type>programs?/[^/]+|(?:feature|video|new)s)?/\d{4}/\d{1,2}/\d{1,2}/(?P<id>[^/?&#]+)'
 
     _TESTS = [{
-        'url': 'https://www.aljazeera.com/program/episode/2014/9/19/deliverance',
+        'url': 'https://balkans.aljazeera.net/videos/2021/11/6/pojedini-domovi-u-sarajevu-jos-pod-vodom-mjestanima-se-dostavlja-hrana',
         'info_dict': {
-            'id': '3792260579001',
+            'id': '6280641530001',
             'ext': 'mp4',
-            'title': 'The Slum - Episode 1: Deliverance',
-            'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.',
-            'uploader_id': '665003303001',
-            'timestamp': 1411116829,
-            'upload_date': '20140919',
-        },
-        'add_ie': ['BrightcoveNew'],
-        'skip': 'Not accessible from Travis CI server',
-    }, {
-        'url': 'https://www.aljazeera.com/videos/2017/5/11/sierra-leone-709-carat-diamond-to-be-auctioned-off',
-        'only_matching': True,
+            'title': 'Pojedini domovi u Sarajevu još pod vodom, mještanima se dostavlja hrana',
+            'timestamp': 1636219149,
+            'description': 'U sarajevskim naseljima Rajlovac i Reljevo stambeni objekti, ali i industrijska postrojenja i dalje su pod vodom.',
+            'upload_date': '20211106',
+        }
     }, {
-        'url': 'https://www.aljazeera.com/features/2017/8/21/transforming-pakistans-buses-into-art',
-        'only_matching': True,
+        'url': 'https://balkans.aljazeera.net/videos/2021/11/6/djokovic-usao-u-finale-mastersa-u-parizu',
+        'info_dict': {
+            'id': '6280654936001',
+            'ext': 'mp4',
+            'title': 'Đoković ušao u finale Mastersa u Parizu',
+            'timestamp': 1636221686,
+            'description': 'Novak Đoković je u polufinalu Mastersa u Parizu nakon preokreta pobijedio Poljaka Huberta Hurkacza.',
+            'upload_date': '20211106',
+        },
     }]
-    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
+    BRIGHTCOVE_URL_RE = r'https?://players.brightcove.net/(?P<account>\d+)/(?P<player_id>[a-zA-Z0-9]+)_(?P<embed>[^/]+)/index.html\?videoId=(?P<id>\d+)'
 
     def _real_extract(self, url):
-        post_type, name = self._match_valid_url(url).groups()
+        base, post_type, id = self._match_valid_url(url).groups()
+        wp = {
+            'balkans.aljazeera.net': 'ajb',
+            'chinese.aljazeera.net': 'chinese',
+            'mubasher.aljazeera.net': 'ajm',
+        }.get(base) or 'aje'
         post_type = {
             'features': 'post',
             'program': 'episode',
+            'programs': 'episode',
             'videos': 'video',
+            'news': 'news',
         }[post_type.split('/')[0]]
         video = self._download_json(
-            'https://www.aljazeera.com/graphql', name, query={
+            f'https://{base}/graphql', id, query={
+                'wp-site': wp,
                 'operationName': 'ArchipelagoSingleArticleQuery',
                 'variables': json.dumps({
-                    'name': name,
+                    'name': id,
                     'postType': post_type,
                 }),
             }, headers={
-                'wp-site': 'aje',
-            })['data']['article']['video']
-        video_id = video['id']
-        account_id = video.get('accountId') or '665003303001'
-        player_id = video.get('playerId') or 'BkeSH5BDb'
-        return self.url_result(
-            self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id),
-            'BrightcoveNew', video_id)
+                'wp-site': wp,
+            })
+        video = try_get(video, lambda x: x['data']['article']['video']) or {}
+        video_id = video.get('id')
+        account = video.get('accountId') or '911432371001'
+        player_id = video.get('playerId') or 'csvTfAlKW'
+        embed = 'default'
+
+        if video_id is None:
+            webpage = self._download_webpage(url, id)
+
+            account, player_id, embed, video_id = self._search_regex(
+                self.BRIGHTCOVE_URL_RE, webpage, 'video id',
+                group=(1, 2, 3, 4), default=(None, None, None, None))
+
+        if video_id is None:
+            return {
+                '_type': 'url_transparent',
+                'url': url,
+                'ie_key': 'Generic'
+            }
+
+        return {
+            '_type': 'url_transparent',
+            'url': f'https://players.brightcove.net/{account}/{player_id}_{embed}/index.html?videoId={video_id}',
+            'ie_key': 'BrightcoveNew'
+        }
diff --git a/hypervideo_dl/extractor/allocine.py b/hypervideo_dl/extractor/allocine.py
index cd533ac..403a277 100644
--- a/hypervideo_dl/extractor/allocine.py
+++ b/hypervideo_dl/extractor/allocine.py
@@ -7,6 +7,7 @@ from ..utils import (
     int_or_none,
     qualities,
     remove_end,
+    strip_or_none,
     try_get,
     unified_timestamp,
     url_basename,
@@ -102,10 +103,7 @@ class AllocineIE(InfoExtractor):
             video_id = display_id
             media_data = self._download_json(
                 'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, display_id)
-        title = remove_end(
-            self._html_search_regex(
-                r'(?s)<title>(.+?)</title>', webpage, 'title').strip(),
-            ' - AlloCiné')
+        title = remove_end(strip_or_none(self._html_extract_title(webpage)), ' - AlloCiné')
         for key, value in media_data['video'].items():
             if not key.endswith('Path'):
                 continue
diff --git a/hypervideo_dl/extractor/alsace20tv.py b/hypervideo_dl/extractor/alsace20tv.py
new file mode 100644
index 0000000..4aae6fe
--- /dev/null
+++ b/hypervideo_dl/extractor/alsace20tv.py
@@ -0,0 +1,87 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    dict_get,
+    get_element_by_class,
+    int_or_none,
+    unified_strdate,
+    url_or_none,
+)
+
+
+class Alsace20TVBaseIE(InfoExtractor):
+    def _extract_video(self, video_id, url=None):
+        info = self._download_json(
+            'https://www.alsace20.tv/visionneuse/visio_v9_js.php?key=%s&habillage=0&mode=html' % (video_id, ),
+            video_id) or {}
+        title = info.get('titre')
+
+        formats = []
+        for res, fmt_url in (info.get('files') or {}).items():
+            formats.extend(
+                self._extract_smil_formats(fmt_url, video_id, fatal=False)
+                if '/smil:_' in fmt_url
+                else self._extract_mpd_formats(fmt_url, video_id, mpd_id=res, fatal=False))
+        self._sort_formats(formats)
+
+        webpage = (url and self._download_webpage(url, video_id, fatal=False)) or ''
+        thumbnail = url_or_none(dict_get(info, ('image', 'preview', )) or self._og_search_thumbnail(webpage))
+        upload_date = self._search_regex(r'/(\d{6})_', thumbnail, 'upload_date', default=None)
+        upload_date = unified_strdate('20%s-%s-%s' % (upload_date[:2], upload_date[2:4], upload_date[4:])) if upload_date else None
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': clean_html(get_element_by_class('wysiwyg', webpage)),
+            'upload_date': upload_date,
+            'thumbnail': thumbnail,
+            'duration': int_or_none(self._og_search_property('video:duration', webpage) if webpage else None),
+            'view_count': int_or_none(info.get('nb_vues')),
+        }
+
+
+class Alsace20TVIE(Alsace20TVBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?alsace20\.tv/(?:[\w-]+/)+[\w-]+-(?P<id>[\w]+)'
+    _TESTS = [{
+        'url': 'https://www.alsace20.tv/VOD/Actu/JT/Votre-JT-jeudi-3-fevrier-lyNHCXpYJh.html',
+        'info_dict': {
+            'id': 'lyNHCXpYJh',
+            'ext': 'mp4',
+            'description': 'md5:fc0bc4a0692d3d2dba4524053de4c7b7',
+            'title': 'Votre JT du jeudi 3 février',
+            'upload_date': '20220203',
+            'thumbnail': r're:https?://.+\.jpg',
+            'duration': 1073,
+            'view_count': int,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return self._extract_video(video_id, url)
+
+
+class Alsace20TVEmbedIE(Alsace20TVBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?alsace20\.tv/emb/(?P<id>[\w]+)'
+    _TESTS = [{
+        'url': 'https://www.alsace20.tv/emb/lyNHCXpYJh',
+        # 'md5': 'd91851bf9af73c0ad9b2cdf76c127fbb',
+        'info_dict': {
+            'id': 'lyNHCXpYJh',
+            'ext': 'mp4',
+            'title': 'Votre JT du jeudi 3 février',
+            'upload_date': '20220203',
+            'thumbnail': r're:https?://.+\.jpg',
+            'view_count': int,
+        },
+        'params': {
+            'format': 'bestvideo',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return self._extract_video(video_id)
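Note how Alsace20TVBaseIE recovers the upload date: a six-digit YYMMDD token in the thumbnail filename is expanded to a full date before being normalized. A worked example of that string surgery (the thumbnail URL is made up):

    import re

    thumbnail = 'https://www.alsace20.tv/some/path/220203_JT_thumb.jpg'  # fabricated URL
    token = re.search(r'/(\d{6})_', thumbnail).group(1)   # '220203' = YYMMDD
    upload_date = '20%s-%s-%s' % (token[:2], token[2:4], token[4:])
    print(upload_date)  # 2022-02-03; unified_strdate() then yields '20220203'
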
+ } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')] + return self.playlist_result(entries, playlist_id=id, playlist_title=data_json['title']) diff --git a/hypervideo_dl/extractor/animelab.py b/hypervideo_dl/extractor/animelab.py index 4fb7ee4..1c2cc47 100644 --- a/hypervideo_dl/extractor/animelab.py +++ b/hypervideo_dl/extractor/animelab.py @@ -15,25 +15,21 @@ from ..compat import compat_HTTPError class AnimeLabBaseIE(InfoExtractor): - _LOGIN_REQUIRED = True _LOGIN_URL = 'https://www.animelab.com/login' _NETRC_MACHINE = 'animelab' + _LOGGED_IN = False - def _login(self): - def is_logged_in(login_webpage): - return 'Sign In' not in login_webpage + def _is_logged_in(self, login_page=None): + if not self._LOGGED_IN: + if not login_page: + login_page = self._download_webpage(self._LOGIN_URL, None, 'Downloading login page') + AnimeLabBaseIE._LOGGED_IN = 'Sign In' not in login_page + return self._LOGGED_IN - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - - # Check if already logged in - if is_logged_in(login_page): + def _perform_login(self, username, password): + if self._is_logged_in(): return - (username, password) = self._get_login_info() - if username is None and self._LOGIN_REQUIRED: - self.raise_login_required('Login is required to access any AnimeLab content') - login_form = { 'email': username, 'password': password, @@ -47,17 +43,14 @@ class AnimeLabBaseIE(InfoExtractor): except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: raise ExtractorError('Unable to log in (wrong credentials?)', expected=True) - else: - raise + raise - # if login was successful - if is_logged_in(response): - return - - raise ExtractorError('Unable to login (cannot verify if logged in)') + if not self._is_logged_in(response): + raise ExtractorError('Unable to login (cannot verify if logged in)') def _real_initialize(self): - self._login() + if not self._is_logged_in(): + self.raise_login_required('Login is required to access any AnimeLab content') class AnimeLabIE(AnimeLabBaseIE): diff --git a/hypervideo_dl/extractor/animeondemand.py b/hypervideo_dl/extractor/animeondemand.py index 54e097d..2e674d5 100644 --- a/hypervideo_dl/extractor/animeondemand.py +++ b/hypervideo_dl/extractor/animeondemand.py @@ -8,6 +8,7 @@ from ..utils import ( determine_ext, extract_attributes, ExtractorError, + join_nonempty, url_or_none, urlencode_postdata, urljoin, @@ -52,11 +53,7 @@ class AnimeOnDemandIE(InfoExtractor): 'only_matching': True, }] - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') @@ -92,9 +89,6 @@ class AnimeOnDemandIE(InfoExtractor): raise ExtractorError('Unable to login: %s' % error, expected=True) raise ExtractorError('Unable to log in') - def _real_initialize(self): - self._login() - def _real_extract(self, url): anime_id = self._match_id(url) @@ -140,15 +134,8 @@ class AnimeOnDemandIE(InfoExtractor): kind = self._search_regex( r'videomaterialurl/\d+/([^/]+)/', playlist_url, 'media kind', default=None) - format_id_list = [] - if lang: - format_id_list.append(lang) - if kind: - format_id_list.append(kind) - if not format_id_list and num is not None: - format_id_list.append(compat_str(num)) - format_id = '-'.join(format_id_list) - format_note = ', '.join(filter(None, (kind, lang_note))) + format_id = 
join_nonempty(lang, kind) if lang or kind else str(num) + format_note = join_nonempty(kind, lang_note, delim=', ') item_id_list = [] if format_id: item_id_list.append(format_id) @@ -195,12 +182,10 @@ class AnimeOnDemandIE(InfoExtractor): if not file_: continue ext = determine_ext(file_) - format_id_list = [lang, kind] - if ext == 'm3u8': - format_id_list.append('hls') - elif source.get('type') == 'video/dash' or ext == 'mpd': - format_id_list.append('dash') - format_id = '-'.join(filter(None, format_id_list)) + format_id = join_nonempty( + lang, kind, + 'hls' if ext == 'm3u8' else None, + 'dash' if source.get('type') == 'video/dash' or ext == 'mpd' else None) if ext == 'm3u8': file_formats = self._extract_m3u8_formats( file_, video_id, 'mp4', diff --git a/hypervideo_dl/extractor/ant1newsgr.py b/hypervideo_dl/extractor/ant1newsgr.py new file mode 100644 index 0000000..1075b46 --- /dev/null +++ b/hypervideo_dl/extractor/ant1newsgr.py @@ -0,0 +1,143 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import urllib.parse + +from .common import InfoExtractor +from ..utils import ( + HEADRequest, + ExtractorError, + determine_ext, + scale_thumbnails_to_max_format_width, + unescapeHTML, +) + + +class Ant1NewsGrBaseIE(InfoExtractor): + def _download_and_extract_api_data(self, video_id, netloc, cid=None): + url = f'{self.http_scheme()}//{netloc}{self._API_PATH}' + info = self._download_json(url, video_id, query={'cid': cid or video_id}) + try: + source = info['url'] + except KeyError: + raise ExtractorError('no source found for %s' % video_id) + formats, subs = (self._extract_m3u8_formats_and_subtitles(source, video_id, 'mp4') + if determine_ext(source) == 'm3u8' else ([{'url': source}], {})) + self._sort_formats(formats) + thumbnails = scale_thumbnails_to_max_format_width( + formats, [{'url': info['thumb']}], r'(?<=/imgHandler/)\d+') + return { + 'id': video_id, + 'title': info.get('title'), + 'thumbnails': thumbnails, + 'formats': formats, + 'subtitles': subs, + } + + +class Ant1NewsGrWatchIE(Ant1NewsGrBaseIE): + IE_NAME = 'ant1newsgr:watch' + IE_DESC = 'ant1news.gr videos' + _VALID_URL = r'https?://(?P<netloc>(?:www\.)?ant1news\.gr)/watch/(?P<id>\d+)/' + _API_PATH = '/templates/data/player' + + _TESTS = [{ + 'url': 'https://www.ant1news.gr/watch/1506168/ant1-news-09112021-stis-18-45', + 'md5': '95925e6b32106754235f2417e0d2dfab', + 'info_dict': { + 'id': '1506168', + 'ext': 'mp4', + 'title': 'md5:0ad00fa66ecf8aa233d26ab0dba7514a', + 'description': 'md5:18665af715a6dcfeac1d6153a44f16b0', + 'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/26d46bf6-8158-4f02-b197-7096c714b2de.jpg', + }, + }] + + def _real_extract(self, url): + video_id, netloc = self._match_valid_url(url).group('id', 'netloc') + webpage = self._download_webpage(url, video_id) + info = self._download_and_extract_api_data(video_id, netloc) + info['description'] = self._og_search_description(webpage) + return info + + +class Ant1NewsGrArticleIE(Ant1NewsGrBaseIE): + IE_NAME = 'ant1newsgr:article' + IE_DESC = 'ant1news.gr articles' + _VALID_URL = r'https?://(?:www\.)?ant1news\.gr/[^/]+/article/(?P<id>\d+)/' + + _TESTS = [{ + 'url': 'https://www.ant1news.gr/afieromata/article/549468/o-tzeims-mpont-sta-meteora-oi-apeiles-kai-o-xesikomos-ton-kalogeron', + 'md5': '294f18331bb516539d72d85a82887dcc', + 'info_dict': { + 'id': '_xvg/m_cmbatw=', + 'ext': 'mp4', + 'title': 'md5:a93e8ecf2e4073bfdffcb38f59945411', + 'timestamp': 1603092840, + 'upload_date': '20201019', + 'thumbnail':
'https://ant1media.azureedge.net/imgHandler/640/756206d2-d640-40e2-b201-3555abdfc0db.jpg', + }, + }, { + 'url': 'https://ant1news.gr/Society/article/620286/symmoria-anilikon-dikigoros-thymaton-ithelan-na-toys-apoteleiosoyn', + 'info_dict': { + 'id': '620286', + 'title': 'md5:91fe569e952e4d146485740ae927662b', + }, + 'playlist_mincount': 2, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + info = self._search_json_ld(webpage, video_id, expected_type='NewsArticle') + embed_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage)) + if not embed_urls: + raise ExtractorError('no videos found for %s' % video_id, expected=True) + return self.playlist_from_matches( + embed_urls, video_id, info.get('title'), ie=Ant1NewsGrEmbedIE.ie_key(), + video_kwargs={'url_transparent': True, 'timestamp': info.get('timestamp')}) + + +class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE): + IE_NAME = 'ant1newsgr:embed' + IE_DESC = 'ant1news.gr embedded videos' + _BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player' + _VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P<id>[^#&]+)' + _API_PATH = '/news/templates/data/jsonPlayer' + + _TESTS = [{ + 'url': 'https://www.antenna.gr/templates/pages/player?cid=3f_li_c_az_jw_y_u=&w=670&h=377', + 'md5': 'dfc58c3a11a5a9aad2ba316ed447def3', + 'info_dict': { + 'id': '3f_li_c_az_jw_y_u=', + 'ext': 'mp4', + 'title': 'md5:a30c93332455f53e1e84ae0724f0adf7', + 'thumbnail': 'https://ant1media.azureedge.net/imgHandler/640/bbe31201-3f09-4a4e-87f5-8ad2159fffe2.jpg', + }, + }] + + @classmethod + def _extract_urls(cls, webpage): + _EMBED_URL_RE = rf'{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+' + _EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_EMBED_URL_RE})(?P=_q1)' + for mobj in re.finditer(_EMBED_RE, webpage): + url = unescapeHTML(mobj.group('url')) + if not cls.suitable(url): + continue + yield url + + def _real_extract(self, url): + video_id = self._match_id(url) + + canonical_url = self._request_webpage( + HEADRequest(url), video_id, + note='Resolve canonical player URL', + errnote='Could not resolve canonical player URL').geturl() + _, netloc, _, _, query, _ = urllib.parse.urlparse(canonical_url) + cid = urllib.parse.parse_qs(query)['cid'][0] + + return self._download_and_extract_api_data(video_id, netloc, cid=cid) diff --git a/hypervideo_dl/extractor/anvato.py b/hypervideo_dl/extractor/anvato.py index b82f0b5..686d453 100644 --- a/hypervideo_dl/extractor/anvato.py +++ b/hypervideo_dl/extractor/anvato.py @@ -16,6 +16,7 @@ from ..utils import ( determine_ext, intlist_to_bytes, int_or_none, + join_nonempty, strip_jsonp, unescapeHTML, unsmuggle_url, @@ -303,13 +304,13 @@ class AnvatoIE(InfoExtractor): tbr = int_or_none(published_url.get('kbps')) a_format = { 'url': video_url, - 'format_id': ('-'.join(filter(None, ['http', published_url.get('cdn_name')]))).lower(), - 'tbr': tbr if tbr != 0 else None, + 'format_id': join_nonempty('http', published_url.get('cdn_name')).lower(), + 'tbr': tbr or None, } if media_format == 'm3u8' and tbr is not None: a_format.update({ - 'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])), + 'format_id': join_nonempty('hls', tbr), + 'ext': 'mp4', }) elif media_format == 'm3u8-variant' or ext == 'm3u8': diff --git a/hypervideo_dl/extractor/aparat.py b/hypervideo_dl/extractor/aparat.py index da06a3c..1057233 100644 --- a/hypervideo_dl/extractor/aparat.py +++ b/hypervideo_dl/extractor/aparat.py @@ -33,19
+33,22 @@ class AparatIE(InfoExtractor): 'only_matching': True, }] + def _parse_options(self, webpage, video_id, fatal=True): + return self._parse_json(self._search_regex( + r'options\s*=\s*({.+?})\s*;', webpage, 'options', default='{}'), video_id) + def _real_extract(self, url): video_id = self._match_id(url) - # Provides more metadata + # If available, provides more metadata webpage = self._download_webpage(url, video_id, fatal=False) + options = self._parse_options(webpage, video_id, fatal=False) - if not webpage: + if not options: webpage = self._download_webpage( 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, - video_id) - - options = self._parse_json(self._search_regex( - r'options\s*=\s*({.+?})\s*;', webpage, 'options'), video_id) + video_id, 'Downloading embed webpage') + options = self._parse_options(webpage, video_id) formats = [] for sources in (options.get('multiSRC') or []): diff --git a/hypervideo_dl/extractor/applepodcasts.py b/hypervideo_dl/extractor/applepodcasts.py index 6a74de7..9139ff7 100644 --- a/hypervideo_dl/extractor/applepodcasts.py +++ b/hypervideo_dl/extractor/applepodcasts.py @@ -3,7 +3,9 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + clean_html, clean_podcast_url, + get_element_by_class, int_or_none, parse_iso8601, try_get, @@ -14,16 +16,17 @@ class ApplePodcastsIE(InfoExtractor): _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P\d+)' _TESTS = [{ 'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', - 'md5': 'df02e6acb11c10e844946a39e7222b08', + 'md5': '41dc31cd650143e530d9423b6b5a344f', 'info_dict': { 'id': '1000482637777', 'ext': 'mp3', 'title': '207 - Whitney Webb Returns', - 'description': 'md5:13a73bade02d2e43737751e3987e1399', + 'description': 'md5:75ef4316031df7b41ced4e7b987f79c6', 'upload_date': '20200705', - 'timestamp': 1593921600, - 'duration': 6425, + 'timestamp': 1593932400, + 'duration': 6454, 'series': 'The Tim Dillon Show', + 'thumbnail': 're:.+[.](png|jpe?g|webp)', } }, { 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', @@ -39,24 +42,47 @@ class ApplePodcastsIE(InfoExtractor): def _real_extract(self, url): episode_id = self._match_id(url) webpage = self._download_webpage(url, episode_id) - ember_data = self._parse_json(self._search_regex( - r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', - webpage, 'ember data'), episode_id) - ember_data = ember_data.get(episode_id) or ember_data - episode = ember_data['data']['attributes'] + episode_data = {} + ember_data = {} + # new page type 2021-11 + amp_data = self._parse_json(self._search_regex( + r'(?s)id="shoebox-media-api-cache-amp-podcasts"[^>]*>\s*({.+?})\s*<', + webpage, 'AMP data', default='{}'), episode_id, fatal=False) or {} + amp_data = try_get(amp_data, + lambda a: self._parse_json( + next(a[x] for x in iter(a) if episode_id in x), + episode_id), + dict) or {} + amp_data = amp_data.get('d') or [] + episode_data = try_get( + amp_data, + lambda a: next(x for x in a + if x['type'] == 'podcast-episodes' and x['id'] == episode_id), + dict) + if not episode_data: + # try pre 2021-11 page type: TODO: consider deleting if no longer used + ember_data = self._parse_json(self._search_regex( + r'(?s)id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', + webpage, 'ember data'), episode_id) or {} + ember_data = ember_data.get(episode_id) or ember_data + episode_data = 
try_get(ember_data, lambda x: x['data'], dict) + episode = episode_data['attributes'] description = episode.get('description') or {} series = None - for inc in (ember_data.get('included') or []): + for inc in (amp_data or ember_data.get('included') or []): if inc.get('type') == 'media/podcast': series = try_get(inc, lambda x: x['attributes']['name']) + series = series or clean_html(get_element_by_class('podcast-header__identity', webpage)) return { 'id': episode_id, - 'title': episode['name'], + 'title': episode.get('name'), 'url': clean_podcast_url(episode['assetUrl']), 'description': description.get('standard') or description.get('short'), 'timestamp': parse_iso8601(episode.get('releaseDateTime')), 'duration': int_or_none(episode.get('durationInMilliseconds'), 1000), 'series': series, + 'thumbnail': self._og_search_thumbnail(webpage), + 'vcodec': 'none', } diff --git a/hypervideo_dl/extractor/archiveorg.py b/hypervideo_dl/extractor/archiveorg.py index d90fcb1..2ab3c1b 100644 --- a/hypervideo_dl/extractor/archiveorg.py +++ b/hypervideo_dl/extractor/archiveorg.py @@ -3,33 +3,37 @@ from __future__ import unicode_literals import re import json - from .common import InfoExtractor -from .youtube import YoutubeIE +from .youtube import YoutubeIE, YoutubeBaseInfoExtractor from ..compat import ( compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, compat_HTTPError ) from ..utils import ( + bug_reports_message, clean_html, - determine_ext, dict_get, extract_attributes, ExtractorError, + get_element_by_id, HEADRequest, int_or_none, + join_nonempty, KNOWN_EXTENSIONS, merge_dicts, mimetype2ext, + orderedSet, parse_duration, parse_qs, - RegexNotFoundError, str_to_int, str_or_none, + traverse_obj, try_get, unified_strdate, unified_timestamp, + urlhandle_detect_ext, + url_or_none ) @@ -61,7 +65,7 @@ class ArchiveOrgIE(InfoExtractor): 'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c', 'uploader': 'yorkmba99@hotmail.com', 'timestamp': 1387699629, - 'upload_date': "20131222", + 'upload_date': '20131222', }, }, { 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect', @@ -147,8 +151,7 @@ class ArchiveOrgIE(InfoExtractor): # Archive.org metadata API doesn't clearly demarcate playlist entries # or subtitle tracks, so we get them from the embeddable player. 
- embed_page = self._download_webpage( - 'https://archive.org/embed/' + identifier, identifier) + embed_page = self._download_webpage(f'https://archive.org/embed/{identifier}', identifier) playlist = self._playlist_data(embed_page) entries = {} @@ -163,17 +166,17 @@ 'thumbnails': [], 'artist': p.get('artist'), 'track': p.get('title'), - 'subtitles': {}} + 'subtitles': {}, + } for track in p.get('tracks', []): if track['kind'] != 'subtitles': continue - entries[p['orig']][track['label']] = { - 'url': 'https://archive.org/' + track['file'].lstrip('/')} + entries[p['orig']][track['label']] = { + 'url': 'https://archive.org/' + track['file'].lstrip('/') + } - metadata = self._download_json( - 'http://archive.org/metadata/' + identifier, identifier) + metadata = self._download_json('http://archive.org/metadata/' + identifier, identifier) m = metadata['metadata'] identifier = m['identifier'] @@ -186,7 +189,7 @@ 'license': m.get('licenseurl'), 'release_date': unified_strdate(m.get('date')), 'timestamp': unified_timestamp(dict_get(m, ['publicdate', 'addeddate'])), - 'webpage_url': 'https://archive.org/details/' + identifier, + 'webpage_url': f'https://archive.org/details/{identifier}', 'location': m.get('venue'), 'release_year': int_or_none(m.get('year'))} @@ -204,7 +207,7 @@ 'discnumber': int_or_none(f.get('disc')), 'release_year': int_or_none(f.get('year'))}) entry = entries[f['name']] - elif f.get('original') in entries: + elif traverse_obj(f, 'original', expected_type=str) in entries: entry = entries[f['original']] else: continue @@ -227,13 +230,12 @@ 'filesize': int_or_none(f.get('size')), 'protocol': 'https'}) - # Sort available formats by filesize for entry in entries.values(): - entry['formats'] = list(sorted(entry['formats'], key=lambda x: x.get('filesize', -1))) + self._sort_formats(entry['formats']) if len(entries) == 1: # If there's only one item, use it as the main info dict - only_video = entries[list(entries.keys())[0]] + only_video = next(iter(entries.values())) if entry_id: info = merge_dicts(only_video, info) else: @@ -258,19 +260,19 @@ class YoutubeWebArchiveIE(InfoExtractor): IE_NAME = 'web.archive:youtube' - IE_DESC = 'web.archive.org saved youtube videos' - _VALID_URL = r"""(?x)^ - (?:https?://)?web\.archive\.org/ - (?:web/)? - (?:[0-9A-Za-z_*]+/)? # /web and the version index is optional - - (?:https?(?::|%3[Aa])//)? - (?: - (?:\w+\.)?youtube\.com/watch(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL - |(wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url - ) - (?P<id>[0-9A-Za-z_-]{11})(?:%26|\#|&|$) - """ + IE_DESC = 'web.archive.org saved youtube videos, "ytarchive:" prefix' + _VALID_URL = r'''(?x)(?:(?P<prefix>ytarchive:)| + (?:https?://)?web\.archive\.org/ + (?:web/)?(?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)? # /web and the version index is optional + (?:https?(?::|%3[Aa])//)?(?: + (?:\w+\.)?youtube\.com(?::(?:80|443))?/watch(?:\.php)?(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL + |(?:wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url + ) + )(?P<id>[0-9A-Za-z_-]{11}) + (?(prefix) + (?::(?P<date2>[0-9]{14}))?$| + (?:%26|[#&]|$) + )''' _TESTS = [ { @@ -278,141 +280,395 @@ 'info_dict': { 'id': 'aYAGB11YrSs', 'ext': 'webm', - 'title': 'Team Fortress 2 - Sandviches!'
+ 'title': 'Team Fortress 2 - Sandviches!', + 'description': 'md5:4984c0f9a07f349fc5d8e82ab7af4eaf', + 'upload_date': '20110926', + 'uploader': 'Zeurel', + 'channel_id': 'UCukCyHaD-bK3in_pKpfH9Eg', + 'duration': 32, + 'uploader_id': 'Zeurel', + 'uploader_url': 'http://www.youtube.com/user/Zeurel' } - }, - { + }, { # Internal link 'url': 'https://web.archive.org/web/2oe/http://wayback-fakeurl.archive.org/yt/97t7Xj_iBv0', 'info_dict': { 'id': '97t7Xj_iBv0', 'ext': 'mp4', - 'title': 'How Flexible Machines Could Save The World' + 'title': 'Why Machines That Bend Are Better', + 'description': 'md5:00404df2c632d16a674ff8df1ecfbb6c', + 'upload_date': '20190312', + 'uploader': 'Veritasium', + 'channel_id': 'UCHnyfMqiRRG1u-2MsSQLbXA', + 'duration': 771, + 'uploader_id': '1veritasium', + 'uploader_url': 'http://www.youtube.com/user/1veritasium' } - }, - { - # Video from 2012, webm format itag 45. + }, { + # Video from 2012, webm format itag 45. Newest capture is deleted video, with an invalid description. + # Should use the date in the link. Title ends with '- Youtube'. Capture has description in eow-description 'url': 'https://web.archive.org/web/20120712231619/http://www.youtube.com/watch?v=AkhihxRKcrs&gl=US&hl=en', 'info_dict': { 'id': 'AkhihxRKcrs', 'ext': 'webm', - 'title': 'Limited Run: Mondo\'s Modern Classic 1 of 3 (SDCC 2012)' + 'title': 'Limited Run: Mondo\'s Modern Classic 1 of 3 (SDCC 2012)', + 'upload_date': '20120712', + 'duration': 398, + 'description': 'md5:ff4de6a7980cb65d951c2f6966a4f2f3', + 'uploader_id': 'machinima', + 'uploader_url': 'http://www.youtube.com/user/machinima' } - }, - { - # Old flash-only video. Webpage title starts with "YouTube - ". + }, { + # FLV video. Video file URL does not provide itag information 'url': 'https://web.archive.org/web/20081211103536/http://www.youtube.com/watch?v=jNQXAC9IVRw', 'info_dict': { 'id': 'jNQXAC9IVRw', - 'ext': 'unknown_video', - 'title': 'Me at the zoo' + 'ext': 'flv', + 'title': 'Me at the zoo', + 'upload_date': '20050423', + 'channel_id': 'UC4QobU6STFB0P71PMvOGN5A', + 'duration': 19, + 'description': 'md5:10436b12e07ac43ff8df65287a56efb4', + 'uploader_id': 'jawed', + 'uploader_url': 'http://www.youtube.com/user/jawed' } - }, - { - # Flash video with .flv extension (itag 34). Title has prefix "YouTube -" - # Title has some weird unicode characters too. + }, { 'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA', 'info_dict': { 'id': 'lTx3G6h2xyA', 'ext': 'flv', - 'title': '‪Madeon - Pop Culture (live mashup)‬‏' + 'title': 'Madeon - Pop Culture (live mashup)', + 'upload_date': '20110711', + 'uploader': 'Madeon', + 'channel_id': 'UCqMDNf3Pn5L7pcNkuSEeO3w', + 'duration': 204, + 'description': 'md5:f7535343b6eda34a314eff8b85444680', + 'uploader_id': 'itsmadeon', + 'uploader_url': 'http://www.youtube.com/user/itsmadeon' } - }, - { # Some versions of Youtube have have "YouTube" as page title in html (and later rewritten by js). - 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw', + }, { + # First capture is of dead video, second is the oldest from CDX response. + 'url': 'https://web.archive.org/https://www.youtube.com/watch?v=1JYutPM8O6E', 'info_dict': { - 'id': 'kH-G_aIBlFw', + 'id': '1JYutPM8O6E', 'ext': 'mp4', - 'title': 'kH-G_aIBlFw' - }, - 'expected_warnings': [ - 'unable to extract title', - ] - }, - { - # First capture is a 302 redirect intermediary page. 
- 'url': 'https://web.archive.org/web/20050214000000/http://www.youtube.com/watch?v=0altSZ96U4M', + 'title': 'Fake Teen Doctor Strikes AGAIN! - Weekly Weird News', + 'upload_date': '20160218', + 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA', + 'duration': 1236, + 'description': 'md5:21032bae736421e89c2edf36d1936947', + 'uploader_id': 'MachinimaETC', + 'uploader_url': 'http://www.youtube.com/user/MachinimaETC' + } + }, { + # First capture of dead video, capture date in link links to dead capture. + 'url': 'https://web.archive.org/web/20180803221945/https://www.youtube.com/watch?v=6FPhZJGvf4E', 'info_dict': { - 'id': '0altSZ96U4M', + 'id': '6FPhZJGvf4E', 'ext': 'mp4', - 'title': '0altSZ96U4M' + 'title': 'WTF: Video Games Still Launch BROKEN?! - T.U.G.S.', + 'upload_date': '20160219', + 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA', + 'duration': 798, + 'description': 'md5:a1dbf12d9a3bd7cb4c5e33b27d77ffe7', + 'uploader_id': 'MachinimaETC', + 'uploader_url': 'http://www.youtube.com/user/MachinimaETC' }, 'expected_warnings': [ - 'unable to extract title', + r'unable to download capture webpage \(it may not be archived\)' ] - }, - { + }, { # Very old YouTube page, has - YouTube in title. + 'url': 'http://web.archive.org/web/20070302011044/http://youtube.com/watch?v=-06-KB9XTzg', + 'info_dict': { + 'id': '-06-KB9XTzg', + 'ext': 'flv', + 'title': 'New Coin Hack!! 100% Safe!!' + } + }, { + 'url': 'web.archive.org/https://www.youtube.com/watch?v=dWW7qP423y8', + 'info_dict': { + 'id': 'dWW7qP423y8', + 'ext': 'mp4', + 'title': 'It\'s Bootleg AirPods Time.', + 'upload_date': '20211021', + 'channel_id': 'UC7Jwj9fkrf1adN4fMmTkpug', + 'channel_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug', + 'duration': 810, + 'description': 'md5:7b567f898d8237b256f36c1a07d6d7bc', + 'uploader': 'DankPods', + 'uploader_id': 'UC7Jwj9fkrf1adN4fMmTkpug', + 'uploader_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug' + } + }, { + # player response contains '};' See: https://github.com/ytdl-org/youtube-dl/issues/27093 + 'url': 'https://web.archive.org/web/20200827003909if_/http://www.youtube.com/watch?v=6Dh-RL__uN4', + 'info_dict': { + 'id': '6Dh-RL__uN4', + 'ext': 'mp4', + 'title': 'bitch lasagna', + 'upload_date': '20181005', + 'channel_id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'channel_url': 'http://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'duration': 135, + 'description': 'md5:2dbe4051feeff2dab5f41f82bb6d11d0', + 'uploader': 'PewDiePie', + 'uploader_id': 'PewDiePie', + 'uploader_url': 'http://www.youtube.com/user/PewDiePie' + } + }, { + 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw', + 'only_matching': True + }, { + 'url': 'https://web.archive.org/web/20050214000000_if/http://www.youtube.com/watch?v=0altSZ96U4M', + 'only_matching': True + }, { # Video not archived, only capture is unavailable video page 'url': 'https://web.archive.org/web/20210530071008/https://www.youtube.com/watch?v=lHJTf93HL1s&spfreload=10', - 'only_matching': True, - }, - { # Encoded url + 'only_matching': True + }, { # Encoded url 'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fgl%3DUS%26v%3DAkhihxRKcrs%26hl%3Den', - 'only_matching': True, - }, - { + 'only_matching': True + }, { 'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fv%3DAkhihxRKcrs%26gl%3DUS%26hl%3Den', - 'only_matching': True, - } + 'only_matching': True + }, { + 'url': 
'https://web.archive.org/web/20060527081937/http://www.youtube.com:80/watch.php?v=ELTFsLT73fA&search=soccer', + 'only_matching': True + }, { + 'url': 'https://web.archive.org/http://www.youtube.com:80/watch?v=-05VVye-ffg', + 'only_matching': True + }, { + 'url': 'ytarchive:BaW_jenozKc:20050214000000', + 'only_matching': True + }, { + 'url': 'ytarchive:BaW_jenozKc', + 'only_matching': True + }, ] + _YT_INITIAL_DATA_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE + _YT_INITIAL_PLAYER_RESPONSE_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*({.+?})[)\s]*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE + _YT_INITIAL_BOUNDARY_RE = r'(?:(?:var\s+meta|</script|\n)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_BOUNDARY_RE + + _YT_DEFAULT_THUMB_SERVERS = ['i.ytimg.com'] # thumbnails most likely archived on these servers + _YT_ALL_THUMB_SERVERS = orderedSet( + _YT_DEFAULT_THUMB_SERVERS + ['img.youtube.com', *[f'{c}{n or ""}.ytimg.com' for c in ('i', 's') for n in (*range(0, 5), 9)]]) + + _WAYBACK_BASE_URL = 'https://web.archive.org/web/%sif_/' + _OLDEST_CAPTURE_DATE = 20050214000000 + _NEWEST_CAPTURE_DATE = 20500101000000 + + def _call_cdx_api(self, item_id, url, filters: list = None, collapse: list = None, query: dict = None, note=None, fatal=False): + # CDX docs: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md + query = { + 'url': url, + 'output': 'json', + 'fl': 'original,mimetype,length,timestamp', + 'limit': 500, + 'filter': ['statuscode:200'] + (filters or []), + 'collapse': collapse or [], + **(query or {}) + } + res = self._download_json('https://web.archive.org/cdx/search/cdx', item_id, note=note or 'Downloading CDX API JSON', query=query, fatal=fatal) + if isinstance(res, list) and len(res) >= 2: + # format response to make it easier to use + return list(dict(zip(res[0], v)) for v in res[1:]) + elif not isinstance(res, list) or len(res) != 0: + self.report_warning('Error while parsing CDX API response' + bug_reports_message()) + + def _extract_yt_initial_variable(self, webpage, regex, video_id, name): + return self._parse_json(self._search_regex( + (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), + regex), webpage, name, default='{}'), video_id, fatal=False) + + def _extract_webpage_title(self, webpage): + page_title = self._html_extract_title(webpage, default='') + # YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix. + return self._html_search_regex( + r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)', + page_title, 'title', default='') + + def _extract_metadata(self, video_id, webpage): + search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None)) + player_response = self._extract_yt_initial_variable( + webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, video_id, 'initial player response') or {} + initial_data = self._extract_yt_initial_variable( + webpage, self._YT_INITIAL_DATA_RE, video_id, 'initial data') or {} + + initial_data_video = traverse_obj( + initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'videoPrimaryInfoRenderer'), + expected_type=dict, get_all=False, default={}) + + video_details = traverse_obj( + player_response, 'videoDetails', expected_type=dict, get_all=False, default={}) + + microformats = traverse_obj( + player_response, ('microformat', 'playerMicroformatRenderer'), expected_type=dict, get_all=False, default={}) + + video_title = ( + video_details.get('title') + or YoutubeBaseInfoExtractor._get_text(microformats, 'title') + or YoutubeBaseInfoExtractor._get_text(initial_data_video, 'title') + or self._extract_webpage_title(webpage) + or search_meta(['og:title', 'twitter:title', 'title'])) + + channel_id = str_or_none( + video_details.get('channelId') + or microformats.get('externalChannelId') + or search_meta('channelId') + or self._search_regex( + r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1', # @b45a9e6 + webpage, 'channel id', default=None, group='id')) + channel_url = f'http://www.youtube.com/channel/{channel_id}' if channel_id else None + + duration = int_or_none( + video_details.get('lengthSeconds') + or microformats.get('lengthSeconds') + or parse_duration(search_meta('duration'))) + description = ( + video_details.get('shortDescription') + or YoutubeBaseInfoExtractor._get_text(microformats, 'description') + or clean_html(get_element_by_id('eow-description', webpage)) # @9e6dd23
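+            # fallback order mirrors reliability: live player response first, then microformat JSON, then HTML scraped from pre-polymer page layouts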
or search_meta(['description', 'og:description', 'twitter:description'])) + + uploader = video_details.get('author') + + # Uploader ID and URL + uploader_mobj = re.search( + r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', # @fd05024 + webpage) + if uploader_mobj is not None: + uploader_id, uploader_url = uploader_mobj.group('uploader_id'), uploader_mobj.group('uploader_url') + else: + # @a6211d2 + uploader_url = url_or_none(microformats.get('ownerProfileUrl')) + uploader_id = self._search_regex( + r'(?:user|channel)/([^/]+)', uploader_url or '', 'uploader id', default=None) + + upload_date = unified_strdate( + dict_get(microformats, ('uploadDate', 'publishDate')) + or search_meta(['uploadDate', 'datePublished']) + or self._search_regex( + [r'(?s)id="eow-date.*?>(.*?)</span>', + r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], # @7998520 + webpage, 'upload date', default=None)) + + return { + 'title': video_title, + 'description': description, + 'upload_date': upload_date, + 'uploader': uploader, + 'channel_id': channel_id, + 'channel_url': channel_url, + 'duration': duration, + 'uploader_url': uploader_url, + 'uploader_id': uploader_id, + } + + def _extract_thumbnails(self, video_id): + try_all = 'thumbnails' in self._configuration_arg('check_all') + thumbnail_base_urls = ['http://{server}/vi{webp}/{video_id}'.format( + webp='_webp' if ext == 'webp' else '', video_id=video_id, server=server) + for server in (self._YT_ALL_THUMB_SERVERS if try_all else self._YT_DEFAULT_THUMB_SERVERS) for ext in (('jpg', 'webp') if try_all else ('jpg',))] + + thumbnails = [] + for url in thumbnail_base_urls: + response = self._call_cdx_api( + video_id, url, filters=['mimetype:image/(?:webp|jpeg)'], + collapse=['urlkey'], query={'matchType': 'prefix'}) + if not response: + continue + thumbnails.extend( + { + 'url': (self._WAYBACK_BASE_URL % (int_or_none(thumbnail_dict.get('timestamp')) or self._OLDEST_CAPTURE_DATE)) + thumbnail_dict.get('original'), + 'filesize': int_or_none(thumbnail_dict.get('length')), + 'preference': int_or_none(thumbnail_dict.get('length')) + } for thumbnail_dict in response) + if not try_all: + break + + self._remove_duplicate_formats(thumbnails) + return thumbnails + + def _get_capture_dates(self, video_id, url_date): + capture_dates = [] + # Note: CDX API will not find watch pages with extra params in the url.
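+        # collapse on 'timestamp:6' dedupes captures sharing the same YYYYMM prefix of the 14-digit Wayback timestamp, keeping the CDX response small while still spanning the video's lifetime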
+ response = self._call_cdx_api( + video_id, f'https://www.youtube.com/watch?v={video_id}', + filters=['mimetype:text/html'], collapse=['timestamp:6', 'digest'], query={'matchType': 'prefix'}) or [] + all_captures = sorted([int_or_none(r['timestamp']) for r in response if int_or_none(r['timestamp']) is not None]) + + # Prefer the new polymer UI captures as we support extracting more metadata from them + # WBM captures seem to all switch to this layout ~July 2020 + modern_captures = [x for x in all_captures if x >= 20200701000000] + if modern_captures: + capture_dates.append(modern_captures[0]) + capture_dates.append(url_date) + if all_captures: + capture_dates.append(all_captures[0]) + + if 'captures' in self._configuration_arg('check_all'): + capture_dates.extend(modern_captures + all_captures) + + # Fallbacks if any of the above fail + capture_dates.extend([self._OLDEST_CAPTURE_DATE, self._NEWEST_CAPTURE_DATE]) + return orderedSet(filter(None, capture_dates)) def _real_extract(self, url): - video_id = self._match_id(url) - title = video_id # if we are not able get a title - - def _extract_title(webpage): - page_title = self._html_search_regex( - r'([^<]*)', webpage, 'title', fatal=False) or '' - # YouTube video pages appear to always have either 'YouTube -' as suffix or '- YouTube' as prefix. - try: - page_title = self._html_search_regex( - r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)', - page_title, 'title', default='') - except RegexNotFoundError: - page_title = None - - if not page_title: - self.report_warning('unable to extract title', video_id=video_id) - return - return page_title - - # If the video is no longer available, the oldest capture may be one before it was removed. - # Setting the capture date in url to early date seems to redirect to earliest capture. - webpage = self._download_webpage( - 'https://web.archive.org/web/20050214000000/http://www.youtube.com/watch?v=%s' % video_id, - video_id=video_id, fatal=False, errnote='unable to download video webpage (probably not archived).') - if webpage: - title = _extract_title(webpage) or title - - # Use link translator mentioned in https://github.com/ytdl-org/youtube-dl/issues/13655 - internal_fake_url = 'https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id + video_id, url_date, url_date_2 = self._match_valid_url(url).group('id', 'date', 'date2') + url_date = url_date or url_date_2 + + urlh = None try: - video_file_webpage = self._request_webpage( - HEADRequest(internal_fake_url), video_id, - note='Fetching video file url', expected_status=True) + urlh = self._request_webpage( + HEADRequest('https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id), + video_id, note='Fetching archived video file url', expected_status=True) except ExtractorError as e: # HTTP Error 404 is expected if the video is not saved. if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: - raise ExtractorError( - 'HTTP Error %s. Most likely the video is not archived or issue with web.archive.org.' 
% e.cause.code, + self.raise_no_formats( + 'The requested video is not archived, indexed, or there is an issue with web.archive.org', expected=True) - raise - video_file_url = compat_urllib_parse_unquote(video_file_webpage.url) - video_file_url_qs = parse_qs(video_file_url) - - # Attempt to recover any ext & format info from playback url - format = {'url': video_file_url} - itag = try_get(video_file_url_qs, lambda x: x['itag'][0]) - if itag and itag in YoutubeIE._formats: # Naughty access but it works - format.update(YoutubeIE._formats[itag]) - format.update({'format_id': itag}) - else: - mime = try_get(video_file_url_qs, lambda x: x['mime'][0]) - ext = mimetype2ext(mime) or determine_ext(video_file_url) - format.update({'ext': ext}) - return { - 'id': video_id, - 'title': title, - 'formats': [format], - 'duration': str_to_int(try_get(video_file_url_qs, lambda x: x['dur'][0])) - } + else: + raise + + capture_dates = self._get_capture_dates(video_id, int_or_none(url_date)) + self.write_debug('Captures to try: ' + join_nonempty(*capture_dates, delim=', ')) + info = {'id': video_id} + for capture in capture_dates: + webpage = self._download_webpage( + (self._WAYBACK_BASE_URL + 'http://www.youtube.com/watch?v=%s') % (capture, video_id), + video_id=video_id, fatal=False, errnote='unable to download capture webpage (it may not be archived)', + note='Downloading capture webpage') + current_info = self._extract_metadata(video_id, webpage or '') + # Try avoid getting deleted video metadata + if current_info.get('title'): + info = merge_dicts(info, current_info) + if 'captures' not in self._configuration_arg('check_all'): + break + + info['thumbnails'] = self._extract_thumbnails(video_id) + + if urlh: + url = compat_urllib_parse_unquote(urlh.geturl()) + video_file_url_qs = parse_qs(url) + # Attempt to recover any ext & format info from playback url & response headers + format = {'url': url, 'filesize': int_or_none(urlh.headers.get('x-archive-orig-content-length'))} + itag = try_get(video_file_url_qs, lambda x: x['itag'][0]) + if itag and itag in YoutubeIE._formats: + format.update(YoutubeIE._formats[itag]) + format.update({'format_id': itag}) + else: + mime = try_get(video_file_url_qs, lambda x: x['mime'][0]) + ext = (mimetype2ext(mime) + or urlhandle_detect_ext(urlh) + or mimetype2ext(urlh.headers.get('x-archive-guessed-content-type'))) + format.update({'ext': ext}) + info['formats'] = [format] + if not info.get('duration'): + info['duration'] = str_to_int(try_get(video_file_url_qs, lambda x: x['dur'][0])) + + if not info.get('title'): + info['title'] = video_id + return info diff --git a/hypervideo_dl/extractor/arcpublishing.py b/hypervideo_dl/extractor/arcpublishing.py index 5a9b818..8880e5c 100644 --- a/hypervideo_dl/extractor/arcpublishing.py +++ b/hypervideo_dl/extractor/arcpublishing.py @@ -124,8 +124,7 @@ class ArcPublishingIE(InfoExtractor): formats.extend(smil_formats) elif stream_type in ('ts', 'hls'): m3u8_formats = self._extract_m3u8_formats( - s_url, uuid, 'mp4', 'm3u8' if is_live else 'm3u8_native', - m3u8_id='hls', fatal=False) + s_url, uuid, 'mp4', live=is_live, m3u8_id='hls', fatal=False) if all([f.get('acodec') == 'none' for f in m3u8_formats]): continue for f in m3u8_formats: @@ -158,7 +157,7 @@ class ArcPublishingIE(InfoExtractor): return { 'id': uuid, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'thumbnail': try_get(video, lambda x: x['promo_image']['url']), 'description': try_get(video, lambda x: x['subheadlines']['basic']), 'formats': formats, 
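[review note] Several hunks above (anvato, animeondemand, and archiveorg's capture-date debug line) replace hand-rolled '-'.join(filter(None, ...)) chains with the join_nonempty() helper. A minimal sketch of the assumed semantics, so the hunks can be reviewed without utils.py open — falsy parts are dropped, the survivors are stringified and joined with delim (the real helper in hypervideo_dl/utils.py also takes a from_dict argument not shown here):

    def join_nonempty(*values, delim='-'):
        # drop None/''/0 parts, stringify the rest, join with the delimiter
        return delim.join(str(v) for v in values if v)

    join_nonempty('http', None)             # -> 'http' (cdn_name missing)
    join_nonempty('hls', 128)               # -> 'hls-128' (ints are stringified)
    join_nonempty('de', 'dub', delim=', ')  # -> 'de, dub'

This is why the anvato hunk can pass tbr (an int) directly where the old code needed compat_str(tbr), and why trailing None parts in the animeondemand hunk simply vanish from the format_id.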
diff --git a/hypervideo_dl/extractor/ard.py b/hypervideo_dl/extractor/ard.py index 048d30f..7ea339b 100644 --- a/hypervideo_dl/extractor/ard.py +++ b/hypervideo_dl/extractor/ard.py @@ -280,7 +280,7 @@ class ARDMediathekIE(ARDMediathekBaseIE): info.update({ 'id': video_id, - 'title': self._live_title(title) if info.get('is_live') else title, + 'title': title, 'description': description, 'thumbnail': thumbnail, }) @@ -376,9 +376,24 @@ class ARDIE(InfoExtractor): formats.append(f) self._sort_formats(formats) + _SUB_FORMATS = ( + ('./dataTimedText', 'ttml'), + ('./dataTimedTextNoOffset', 'ttml'), + ('./dataTimedTextVtt', 'vtt'), + ) + + subtitles = {} + for subsel, subext in _SUB_FORMATS: + for node in video_node.findall(subsel): + subtitles.setdefault('de', []).append({ + 'url': node.attrib['url'], + 'ext': subext, + }) + return { 'id': xpath_text(video_node, './videoId', default=display_id), 'formats': formats, + 'subtitles': subtitles, 'display_id': display_id, 'title': video_node.find('./title').text, 'duration': parse_duration(video_node.find('./duration').text), @@ -388,7 +403,14 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): - _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?P<mode>player|live|video|sendung|sammlung)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)' + _VALID_URL = r'''(?x)https:// + (?:(?:beta|www)\.)?ardmediathek\.de/ + (?:(?P<client>[^/]+)/)? + (?:player|live|video|(?P<playlist>sendung|sammlung))/ + (?:(?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/)? + (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+) + (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))''' + _TESTS = [{ 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', 'md5': 'a1dc75a39c61601b980648f7c9f9f71d', @@ -403,6 +425,25 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): 'upload_date': '20200805', 'ext': 'mp4', }, + 'skip': 'Error', + }, { + 'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll', + 'md5': 'f1837e563323b8a642a8ddeff0131f51', + 'info_dict': { + 'id': '10049223', + 'ext': 'mp4', + 'title': 'tagesschau, 20:00 Uhr', + 'timestamp': 1636398000, + 'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b', + 'upload_date': '20211108', + }, + }, { + 'url': 'https://www.ardmediathek.de/sendung/beforeigners/beforeigners/staffel-1/Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw/1', + 'playlist_count': 6, + 'info_dict': { + 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL2JlZm9yZWlnbmVycw', + 'title': 'beforeigners/beforeigners/staffel-1', + }, }, { 'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', 'only_matching': True, @@ -426,6 +467,12 @@ # playlist of type 'sammlung' 'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/', 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet', + 'only_matching': True, }] def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber): @@ -522,23
+569,16 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): break pageNumber = pageNumber + 1 - return self.playlist_result(entries, playlist_title=display_id) + return self.playlist_result(entries, playlist_id, playlist_title=display_id) def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('video_id') - display_id = mobj.group('display_id') - if display_id: - display_id = display_id.rstrip('/') - if not display_id: - display_id = video_id - - if mobj.group('mode') in ('sendung', 'sammlung'): - # this is a playlist-URL - return self._ARD_extract_playlist( - url, video_id, display_id, - mobj.group('client'), - mobj.group('mode')) + video_id, display_id, playlist_type, client, season_number = self._match_valid_url(url).group( + 'id', 'display_id', 'playlist', 'client', 'season') + display_id, client = display_id or video_id, client or 'ard' + + if playlist_type: + # TODO: Extract only specified season + return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type) player_page = self._download_json( 'https://api.ardmediathek.de/public-gateway', @@ -574,7 +614,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): } } } -}''' % (mobj.group('client'), video_id), +}''' % (client, video_id), }).encode(), headers={ 'Content-Type': 'application/json' })['data']['playerPage'] diff --git a/hypervideo_dl/extractor/arnes.py b/hypervideo_dl/extractor/arnes.py index c0032fc..050c252 100644 --- a/hypervideo_dl/extractor/arnes.py +++ b/hypervideo_dl/extractor/arnes.py @@ -7,6 +7,7 @@ from ..compat import ( compat_urllib_parse_urlparse, ) from ..utils import ( + format_field, float_or_none, int_or_none, parse_iso8601, @@ -92,7 +93,7 @@ class ArnesIE(InfoExtractor): 'timestamp': parse_iso8601(video.get('creationTime')), 'channel': channel.get('name'), 'channel_id': channel_id, - 'channel_url': self._BASE_URL + '/?channel=' + channel_id if channel_id else None, + 'channel_url': format_field(channel_id, template=f'{self._BASE_URL}/?channel=%s'), 'duration': float_or_none(video.get('duration'), 1000), 'view_count': int_or_none(video.get('views')), 'tags': video.get('hashtags'), diff --git a/hypervideo_dl/extractor/arte.py b/hypervideo_dl/extractor/arte.py index 296b169..c2f2c1b 100644 --- a/hypervideo_dl/extractor/arte.py +++ b/hypervideo_dl/extractor/arte.py @@ -12,6 +12,7 @@ from ..utils import ( int_or_none, parse_qs, qualities, + strip_or_none, try_get, unified_strdate, url_or_none, @@ -137,6 +138,7 @@ class ArteTVIE(ArteTVBaseIE): break else: lang_pref = -1 + format_note = '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')) media_type = f.get('mediaType') if media_type == 'hls': @@ -144,14 +146,17 @@ class ArteTVIE(ArteTVBaseIE): format_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) for m3u8_format in m3u8_formats: - m3u8_format['language_preference'] = lang_pref + m3u8_format.update({ + 'language_preference': lang_pref, + 'format_note': format_note, + }) formats.extend(m3u8_formats) continue format = { 'format_id': format_id, 'language_preference': lang_pref, - 'format_note': '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')), + 'format_note': format_note, 'width': int_or_none(f.get('width')), 'height': int_or_none(f.get('height')), 'tbr': int_or_none(f.get('bitrate')), @@ -253,3 +258,44 @@ class ArteTVPlaylistIE(ArteTVBaseIE): title = collection.get('title') description = collection.get('shortDescription') or collection.get('teaserText') return self.playlist_result(entries, playlist_id, title, 
description) + + +class ArteTVCategoryIE(ArteTVBaseIE): + _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>[\w-]+(?:/[\w-]+)*)/?\s*$' % ArteTVBaseIE._ARTE_LANGUAGES + _TESTS = [{ + 'url': 'https://www.arte.tv/en/videos/politics-and-society/', + 'info_dict': { + 'id': 'politics-and-society', + 'title': 'Politics and society', + 'description': 'Investigative documentary series, geopolitical analysis, and international commentary', + }, + 'playlist_mincount': 13, + }, + ] + + @classmethod + def suitable(cls, url): + return ( + not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, )) + and super(ArteTVCategoryIE, cls).suitable(url)) + + def _real_extract(self, url): + lang, playlist_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, playlist_id) + + items = [] + for video in re.finditer( + r'<a\b[^>]*?href\s*=\s*(?P<q>"|\'|\b)(?P<url>https?://www\.arte\.tv/%s/videos/[\w/-]+)(?P=q)' % lang, + webpage): + video = video.group('url') + if video == url: + continue + if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )): + items.append(video) + + title = (self._og_search_title(webpage, default=None) + or self._html_search_regex(r'<title\b[^>]*>([^<]+)</title>', webpage, 'title', default=None)) + title = strip_or_none(title.rsplit('|', 1)[0]) or self._generic_title(url) + + return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title, + description=self._og_search_description(webpage, default=None)) diff --git a/hypervideo_dl/extractor/asiancrush.py b/hypervideo_dl/extractor/asiancrush.py index 75a6329..7f1940f 100644 --- a/hypervideo_dl/extractor/asiancrush.py +++ b/hypervideo_dl/extractor/asiancrush.py @@ -181,8 +181,7 @@ class AsianCrushPlaylistIE(AsianCrushBaseIE): 'title', default=None) or self._og_search_title( webpage, default=None) or self._html_search_meta( 'twitter:title', webpage, 'title', - default=None) or self._search_regex( - r'<title>([^<]+)</title>', webpage, 'title', fatal=False) + default=None) or self._html_extract_title(webpage) if title: title = re.sub(r'\s*\|\s*.+?$', '', title) diff --git a/hypervideo_dl/extractor/atresplayer.py b/hypervideo_dl/extractor/atresplayer.py index 8143eb4..465af4e 100644 --- a/hypervideo_dl/extractor/atresplayer.py +++ b/hypervideo_dl/extractor/atresplayer.py @@ -24,9 +24,6 @@ class AtresPlayerIE(InfoExtractor): 'description': 'md5:7634cdcb4d50d5381bedf93efb537fbc', 'duration': 3413, }, - 'params': { - 'format': 'bestvideo', - }, 'skip': 'This video is only available for registered users' }, { @@ -40,9 +37,6 @@ ] _API_BASE = 'https://api.atresplayer.com/' - def _real_initialize(self): - self._login() - def _handle_error(self, e, code): if isinstance(e.cause, compat_HTTPError) and e.cause.code == code: error = self._parse_json(e.cause.read(), None) @@ -51,11 +45,7 @@ raise ExtractorError(error['error_description'], expected=True) raise - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): self._request_webpage( self._API_BASE + 'login', None, 'Downloading login page') diff --git a/hypervideo_dl/extractor/atvat.py b/hypervideo_dl/extractor/atvat.py index 7c30cfc..481a097 100644 --- a/hypervideo_dl/extractor/atvat.py +++ b/hypervideo_dl/extractor/atvat.py @@ -8,6 +8,7 @@ from ..utils import ( float_or_none, jwt_encode_hs256, try_get, + ExtractorError, ) @@ -94,6 +95,11 @@ class ATVAtIE(InfoExtractor): }) video_id, videos_data = list(videos['data'].items())[0] +
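# the API signals geo-blocks in-band: a normal payload whose error title is 'Geo check failed' rather than an HTTP error status +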
error_msg = try_get(videos_data, lambda x: x['error']['title']) + if error_msg == 'Geo check failed': + self.raise_geo_restricted(error_msg) + elif error_msg: + raise ExtractorError(error_msg) entries = [ self._extract_video_info(url, contentResource[video['id']], video) for video in videos_data] diff --git a/hypervideo_dl/extractor/audiomack.py b/hypervideo_dl/extractor/audiomack.py index cc77713..19775cf 100644 --- a/hypervideo_dl/extractor/audiomack.py +++ b/hypervideo_dl/extractor/audiomack.py @@ -14,7 +14,7 @@ from ..utils import ( class AudiomackIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audiomack\.com/song/(?P[\w/-]+)' + _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:song/|(?=.+/song/))(?P[\w/-]+)' IE_NAME = 'audiomack' _TESTS = [ # hosted on audiomack @@ -29,6 +29,7 @@ class AudiomackIE(InfoExtractor): } }, # audiomack wrapper around soundcloud song + # Needs new test URL. { 'add_ie': ['Soundcloud'], 'url': 'http://www.audiomack.com/song/hip-hop-daily/black-mamba-freestyle', @@ -39,15 +40,16 @@ class AudiomackIE(InfoExtractor): 'title': 'Black Mamba Freestyle [Prod. By Danny Wolf]', 'uploader': 'ILOVEMAKONNEN', 'upload_date': '20160414', - } + }, + 'skip': 'Song has been removed from the site', }, ] def _real_extract(self, url): - # URLs end with [uploader name]/[uploader title] + # URLs end with [uploader name]/song/[uploader title] # this title is whatever the user types in, and is rarely # the proper song title. Real metadata is in the api response - album_url_tag = self._match_id(url) + album_url_tag = self._match_id(url).replace('/song/', '/') # Request the extended version of the api for extra fields like artist and title api_response = self._download_json( @@ -73,13 +75,13 @@ class AudiomackIE(InfoExtractor): class AudiomackAlbumIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?audiomack\.com/album/(?P[\w/-]+)' + _VALID_URL = r'https?://(?:www\.)?audiomack\.com/(?:album/|(?=.+/album/))(?P[\w/-]+)' IE_NAME = 'audiomack:album' _TESTS = [ # Standard album playlist { 'url': 'http://www.audiomack.com/album/flytunezcom/tha-tour-part-2-mixtape', - 'playlist_count': 15, + 'playlist_count': 11, 'info_dict': { 'id': '812251', @@ -95,24 +97,27 @@ class AudiomackAlbumIE(InfoExtractor): }, 'playlist': [{ 'info_dict': { - 'title': 'PPP (Pistol P Project) - 9. Heaven or Hell (CHIMACA) ft Zuse (prod by DJ FU)', - 'id': '837577', + 'title': 'PPP (Pistol P Project) - 8. Real (prod by SYK SENSE )', + 'id': '837576', + 'ext': 'mp3', + 'uploader': 'Lil Herb a.k.a. G Herbo', + } + }, { + 'info_dict': { + 'title': 'PPP (Pistol P Project) - 10. 4 Minutes Of Hell Part 4 (prod by DY OF 808 MAFIA)', + 'id': '837580', 'ext': 'mp3', 'uploader': 'Lil Herb a.k.a. G Herbo', } }], - 'params': { - 'playliststart': 9, - 'playlistend': 9, - } } ] def _real_extract(self, url): - # URLs end with [uploader name]/[uploader title] + # URLs end with [uploader name]/album/[uploader title] # this title is whatever the user types in, and is rarely # the proper song title. 
Real metadata is in the api response - album_url_tag = self._match_id(url) + album_url_tag = self._match_id(url).replace('/album/', '/') result = {'_type': 'playlist', 'entries': []} # There is no one endpoint for album metadata - instead it is included/repeated in each song's metadata # Therefore we don't know how many songs the album has and must infi-loop until failure @@ -134,7 +139,7 @@ class AudiomackAlbumIE(InfoExtractor): # Pull out the album metadata and add to result (if it exists) for resultkey, apikey in [('id', 'album_id'), ('title', 'album_title')]: if apikey in api_response and resultkey not in result: - result[resultkey] = api_response[apikey] + result[resultkey] = compat_str(api_response[apikey]) song_id = url_basename(api_response['url']).rpartition('.')[0] result['entries'].append({ 'id': compat_str(api_response.get('id', song_id)), diff --git a/hypervideo_dl/extractor/awaan.py b/hypervideo_dl/extractor/awaan.py index 22cc10d..f5e559c 100644 --- a/hypervideo_dl/extractor/awaan.py +++ b/hypervideo_dl/extractor/awaan.py @@ -9,6 +9,7 @@ from ..compat import ( compat_str, ) from ..utils import ( + format_field, int_or_none, parse_iso8601, smuggle_url, @@ -41,9 +42,9 @@ class AWAANBaseIE(InfoExtractor): return { 'id': video_id, - 'title': self._live_title(title) if is_live else title, + 'title': title, 'description': video_data.get('description_en') or video_data.get('description_ar'), - 'thumbnail': 'http://admin.mangomolo.com/analytics/%s' % img if img else None, + 'thumbnail': format_field(img, template='http://admin.mangomolo.com/analytics/%s'), 'duration': int_or_none(video_data.get('duration')), 'timestamp': parse_iso8601(video_data.get('create_time'), ' '), 'is_live': is_live, diff --git a/hypervideo_dl/extractor/azmedien.py b/hypervideo_dl/extractor/azmedien.py index fee640e..0168340 100644 --- a/hypervideo_dl/extractor/azmedien.py +++ b/hypervideo_dl/extractor/azmedien.py @@ -11,11 +11,12 @@ class AZMedienIE(InfoExtractor): IE_DESC = 'AZ Medien videos' _VALID_URL = r'''(?x) https?:// - (?:www\.)? + (?:www\.|tv\.)? 
(?P telezueri\.ch| telebaern\.tv| - telem1\.ch + telem1\.ch| + tvo-online\.ch )/ [^/]+/ (?P @@ -30,7 +31,7 @@ class AZMedienIE(InfoExtractor): ''' _TESTS = [{ - 'url': 'https://www.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569', + 'url': 'https://tv.telezueri.ch/sonntalk/bundesrats-vakanzen-eu-rahmenabkommen-133214569', 'info_dict': { 'id': '1_anruz3wy', 'ext': 'mp4', @@ -38,6 +39,9 @@ class AZMedienIE(InfoExtractor): 'uploader_id': 'TVOnline', 'upload_date': '20180930', 'timestamp': 1538328802, + 'view_count': int, + 'thumbnail': 'http://cfvod.kaltura.com/p/1719221/sp/171922100/thumbnail/entry_id/1_anruz3wy/version/100031', + 'duration': 1930 }, 'params': { 'skip_download': True, diff --git a/hypervideo_dl/extractor/banbye.py b/hypervideo_dl/extractor/banbye.py new file mode 100644 index 0000000..3d4d36e --- /dev/null +++ b/hypervideo_dl/extractor/banbye.py @@ -0,0 +1,153 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import math + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlparse, + compat_parse_qs, +) +from ..utils import ( + format_field, + InAdvancePagedList, + traverse_obj, + unified_timestamp, +) + + +class BanByeBaseIE(InfoExtractor): + _API_BASE = 'https://api.banbye.com' + _CDN_BASE = 'https://cdn.banbye.com' + _VIDEO_BASE = 'https://banbye.com/watch' + + @staticmethod + def _extract_playlist_id(url, param='playlist'): + return compat_parse_qs( + compat_urllib_parse_urlparse(url).query).get(param, [None])[0] + + def _extract_playlist(self, playlist_id): + data = self._download_json(f'{self._API_BASE}/playlists/{playlist_id}', playlist_id) + return self.playlist_result([ + self.url_result(f'{self._VIDEO_BASE}/{video_id}', BanByeIE) + for video_id in data['videoIds']], playlist_id, data.get('name')) + + +class BanByeIE(BanByeBaseIE): + _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?watch/(?P\w+)' + _TESTS = [{ + 'url': 'https://banbye.com/watch/v_ytfmvkVYLE8T', + 'md5': '2f4ea15c5ca259a73d909b2cfd558eb5', + 'info_dict': { + 'id': 'v_ytfmvkVYLE8T', + 'ext': 'mp4', + 'title': 'md5:5ec098f88a0d796f987648de6322ba0f', + 'description': 'md5:4d94836e73396bc18ef1fa0f43e5a63a', + 'uploader': 'wRealu24', + 'channel_id': 'ch_wrealu24', + 'channel_url': 'https://banbye.com/channel/ch_wrealu24', + 'timestamp': 1647604800, + 'upload_date': '20220318', + 'duration': 1931, + 'thumbnail': r're:https?://.*\.webp', + 'tags': 'count:5', + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + 'comment_count': int, + }, + }, { + 'url': 'https://banbye.com/watch/v_2JjQtqjKUE_F?playlistId=p_Ld82N6gBw_OJ', + 'info_dict': { + 'title': 'Krzysztof Karoń', + 'id': 'p_Ld82N6gBw_OJ', + }, + 'playlist_count': 9, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + playlist_id = self._extract_playlist_id(url, 'playlistId') + + if self._yes_playlist(playlist_id, video_id): + return self._extract_playlist(playlist_id) + + data = self._download_json(f'{self._API_BASE}/videos/{video_id}', video_id) + thumbnails = [{ + 'id': f'{quality}p', + 'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.webp', + } for quality in [48, 96, 144, 240, 512, 1080]] + formats = [{ + 'format_id': f'http-{quality}p', + 'quality': quality, + 'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.mp4', + } for quality in data['quality']] + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': data.get('title'), + 'description': data.get('desc'), + 'uploader': traverse_obj(data, ('channel', 'name')), + 
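+            # fields below come straight from the /videos/{id} API payload; only publishedAt needs parsing into an epoch timestamp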
diff --git a/hypervideo_dl/extractor/banbye.py b/hypervideo_dl/extractor/banbye.py
new file mode 100644
index 0000000..3d4d36e
--- /dev/null
+++ b/hypervideo_dl/extractor/banbye.py
@@ -0,0 +1,153 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import math
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse_urlparse,
+    compat_parse_qs,
+)
+from ..utils import (
+    format_field,
+    InAdvancePagedList,
+    traverse_obj,
+    unified_timestamp,
+)
+
+
+class BanByeBaseIE(InfoExtractor):
+    _API_BASE = 'https://api.banbye.com'
+    _CDN_BASE = 'https://cdn.banbye.com'
+    _VIDEO_BASE = 'https://banbye.com/watch'
+
+    @staticmethod
+    def _extract_playlist_id(url, param='playlist'):
+        return compat_parse_qs(
+            compat_urllib_parse_urlparse(url).query).get(param, [None])[0]
+
+    def _extract_playlist(self, playlist_id):
+        data = self._download_json(f'{self._API_BASE}/playlists/{playlist_id}', playlist_id)
+        return self.playlist_result([
+            self.url_result(f'{self._VIDEO_BASE}/{video_id}', BanByeIE)
+            for video_id in data['videoIds']], playlist_id, data.get('name'))
+
+
+class BanByeIE(BanByeBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?watch/(?P<id>\w+)'
+    _TESTS = [{
+        'url': 'https://banbye.com/watch/v_ytfmvkVYLE8T',
+        'md5': '2f4ea15c5ca259a73d909b2cfd558eb5',
+        'info_dict': {
+            'id': 'v_ytfmvkVYLE8T',
+            'ext': 'mp4',
+            'title': 'md5:5ec098f88a0d796f987648de6322ba0f',
+            'description': 'md5:4d94836e73396bc18ef1fa0f43e5a63a',
+            'uploader': 'wRealu24',
+            'channel_id': 'ch_wrealu24',
+            'channel_url': 'https://banbye.com/channel/ch_wrealu24',
+            'timestamp': 1647604800,
+            'upload_date': '20220318',
+            'duration': 1931,
+            'thumbnail': r're:https?://.*\.webp',
+            'tags': 'count:5',
+            'like_count': int,
+            'dislike_count': int,
+            'view_count': int,
+            'comment_count': int,
+        },
+    }, {
+        'url': 'https://banbye.com/watch/v_2JjQtqjKUE_F?playlistId=p_Ld82N6gBw_OJ',
+        'info_dict': {
+            'title': 'Krzysztof Karoń',
+            'id': 'p_Ld82N6gBw_OJ',
+        },
+        'playlist_count': 9,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        playlist_id = self._extract_playlist_id(url, 'playlistId')
+
+        if self._yes_playlist(playlist_id, video_id):
+            return self._extract_playlist(playlist_id)
+
+        data = self._download_json(f'{self._API_BASE}/videos/{video_id}', video_id)
+        thumbnails = [{
+            'id': f'{quality}p',
+            'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.webp',
+        } for quality in [48, 96, 144, 240, 512, 1080]]
+        formats = [{
+            'format_id': f'http-{quality}p',
+            'quality': quality,
+            'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.mp4',
+        } for quality in data['quality']]
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': data.get('title'),
+            'description': data.get('desc'),
+            'uploader': traverse_obj(data, ('channel', 'name')),
+            'channel_id': data.get('channelId'),
+            'channel_url': format_field(data, 'channelId', 'https://banbye.com/channel/%s'),
+            'timestamp': unified_timestamp(data.get('publishedAt')),
+            'duration': data.get('duration'),
+            'tags': data.get('tags'),
+            'formats': formats,
+            'thumbnails': thumbnails,
+            'like_count': data.get('likes'),
+            'dislike_count': data.get('dislikes'),
+            'view_count': data.get('views'),
+            'comment_count': data.get('commentCount'),
+        }
+
+
+class BanByeChannelIE(BanByeBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?channel/(?P<id>\w+)'
+    _TESTS = [{
+        'url': 'https://banbye.com/channel/ch_wrealu24',
+        'info_dict': {
+            'title': 'wRealu24',
+            'id': 'ch_wrealu24',
+            'description': 'md5:da54e48416b74dfdde20a04867c0c2f6',
+        },
+        'playlist_mincount': 791,
+    }, {
+        'url': 'https://banbye.com/channel/ch_wrealu24?playlist=p_Ld82N6gBw_OJ',
+        'info_dict': {
+            'title': 'Krzysztof Karoń',
+            'id': 'p_Ld82N6gBw_OJ',
+        },
+        'playlist_count': 9,
+    }]
+    _PAGE_SIZE = 100
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+        playlist_id = self._extract_playlist_id(url)
+
+        if playlist_id:
+            return self._extract_playlist(playlist_id)
+
+        def page_func(page_num):
+            data = self._download_json(f'{self._API_BASE}/videos', channel_id, query={
+                'channelId': channel_id,
+                'sort': 'new',
+                'limit': self._PAGE_SIZE,
+                'offset': page_num * self._PAGE_SIZE,
+            }, note=f'Downloading page {page_num+1}')
+            return [
+                self.url_result(f"{self._VIDEO_BASE}/{video['_id']}", BanByeIE)
+                for video in data['items']
+            ]
+
+        channel_data = self._download_json(f'{self._API_BASE}/channels/{channel_id}', channel_id)
+        entries = InAdvancePagedList(
+            page_func,
+            math.ceil(channel_data['videoCount'] / self._PAGE_SIZE),
+            self._PAGE_SIZE)
+
+        return self.playlist_result(
+            entries, channel_id, channel_data.get('name'), channel_data.get('description'))
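BanByeChannelIE is the standard paged-playlist pattern: the total page count is known up front (from `videoCount`), so `InAdvancePagedList` can fetch pages lazily as the playlist is consumed. A minimal sketch of the same pattern with a stubbed `page_func` (the import path assumes hypervideo's yt-dlp-compatible utils):

    import math
    from hypervideo_dl.utils import InAdvancePagedList

    PAGE_SIZE = 100

    def page_func(page_num):
        # a real page_func would request /videos?offset=page_num*PAGE_SIZE here
        return [f'stub-{page_num}-{i}' for i in range(PAGE_SIZE)]

    pages = InAdvancePagedList(page_func, math.ceil(791 / PAGE_SIZE), PAGE_SIZE)
    first = pages.getslice(0, 5)  # only page 0 is actually fetched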
diff --git a/hypervideo_dl/extractor/bandaichannel.py b/hypervideo_dl/extractor/bandaichannel.py
index d672859..f1bcdef 100644
--- a/hypervideo_dl/extractor/bandaichannel.py
+++ b/hypervideo_dl/extractor/bandaichannel.py
@@ -21,7 +21,6 @@ class BandaiChannelIE(BrightcoveNewIE):
             'duration': 1387.733,
         },
         'params': {
-            'format': 'bestvideo',
             'skip_download': True,
         },
     }]
diff --git a/hypervideo_dl/extractor/bandcamp.py b/hypervideo_dl/extractor/bandcamp.py
index b664145..745055e 100644
--- a/hypervideo_dl/extractor/bandcamp.py
+++ b/hypervideo_dl/extractor/bandcamp.py
@@ -183,6 +183,7 @@ class BandcampIE(InfoExtractor):
                     'format_note': f.get('description'),
                     'filesize': parse_filesize(f.get('size_mb')),
                     'vcodec': 'none',
+                    'acodec': format_id.split('-')[0],
                 })
         self._sort_formats(formats)
@@ -212,7 +213,7 @@ class BandcampIE(InfoExtractor):

 class BandcampAlbumIE(BandcampIE):
     IE_NAME = 'Bandcamp:album'
-    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?!/music)(?:/album/(?P<id>[^/?#&]+))?'
+    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com/album/(?P<id>[^/?#&]+)'

     _TESTS = [{
         'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
@@ -257,14 +258,6 @@ class BandcampAlbumIE(BandcampIE):
             'id': 'hierophany-of-the-open-grave',
         },
         'playlist_mincount': 9,
-    }, {
-        'url': 'http://dotscale.bandcamp.com',
-        'info_dict': {
-            'title': 'Loom',
-            'id': 'dotscale',
-            'uploader_id': 'dotscale',
-        },
-        'playlist_mincount': 7,
     }, {
         # with escaped quote in title
         'url': 'https://jstrecords.bandcamp.com/album/entropy-ep',
@@ -391,41 +384,63 @@ class BandcampWeeklyIE(BandcampIE):
     }

-class BandcampMusicIE(InfoExtractor):
-    _VALID_URL = r'https?://(?P<id>[^/]+)\.bandcamp\.com/music'
+class BandcampUserIE(InfoExtractor):
+    IE_NAME = 'Bandcamp:user'
+    _VALID_URL = r'https?://(?!www\.)(?P<id>[^.]+)\.bandcamp\.com(?:/music)?/?(?:[#?]|$)'
+
+    _TESTS = [{
+        # Type 1 Bandcamp user page.
+        'url': 'https://adrianvonziegler.bandcamp.com',
+        'info_dict': {
+            'id': 'adrianvonziegler',
+            'title': 'Discography of adrianvonziegler',
+        },
+        'playlist_mincount': 23,
+    }, {
+        # Bandcamp user page with only one album
+        'url': 'http://dotscale.bandcamp.com',
+        'info_dict': {
+            'id': 'dotscale',
+            'title': 'Discography of dotscale'
+        },
+        'playlist_count': 1,
+    }, {
+        # Type 2 Bandcamp user page.
+        'url': 'https://nightcallofficial.bandcamp.com',
+        'info_dict': {
+            'id': 'nightcallofficial',
+            'title': 'Discography of nightcallofficial',
+        },
+        'playlist_count': 4,
+    }, {
         'url': 'https://steviasphere.bandcamp.com/music',
         'playlist_mincount': 47,
         'info_dict': {
             'id': 'steviasphere',
+            'title': 'Discography of steviasphere',
         },
     }, {
         'url': 'https://coldworldofficial.bandcamp.com/music',
         'playlist_mincount': 10,
         'info_dict': {
             'id': 'coldworldofficial',
+            'title': 'Discography of coldworldofficial',
         },
     }, {
         'url': 'https://nuclearwarnowproductions.bandcamp.com/music',
         'playlist_mincount': 399,
         'info_dict': {
             'id': 'nuclearwarnowproductions',
+            'title': 'Discography of nuclearwarnowproductions',
         },
-    }
-    ]
-
-    _TYPE_IE_DICT = {
-        'album': BandcampAlbumIE.ie_key(),
-        'track': BandcampIE.ie_key()
-    }
+    }]

     def _real_extract(self, url):
-        id = self._match_id(url)
-        webpage = self._download_webpage(url, id)
-        items = re.findall(r'href\=\"\/(?P<path>(?P<type>album|track)+/[^\"]+)', webpage)
-        entries = [
-            self.url_result(
-                f'https://{id}.bandcamp.com/{item[0]}',
-                ie=self._TYPE_IE_DICT[item[1]])
-            for item in items]
-        return self.playlist_result(entries, id)
+        uploader = self._match_id(url)
+        webpage = self._download_webpage(url, uploader)
+
+        discography_data = (re.findall(r'<li data-item-id=["\'][^>]+>\s*<a href=["\']([^"\']+)', webpage)
+                            or re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage))
+
+        return self.playlist_from_matches(
+            discography_data, uploader, f'Discography of {uploader}', getter=lambda x: urljoin(url, x))
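The new user extractor scrapes relative album/track hrefs from the discography grid and hands them to `playlist_from_matches`, whose getter resolves each href against the page URL. The resolution step in isolation, with made-up hrefs for illustration:

    from urllib.parse import urljoin

    base = 'https://adrianvonziegler.bandcamp.com'
    hrefs = ['/album/frostwinter', '/track/the-celtic-tree']  # as the regex would match them
    print([urljoin(base, h) for h in hrefs])
    # ['https://adrianvonziegler.bandcamp.com/album/frostwinter', '.../track/the-celtic-tree']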
diff --git a/hypervideo_dl/extractor/bbc.py b/hypervideo_dl/extractor/bbc.py
index 4e2dcd7..29ad7de 100644
--- a/hypervideo_dl/extractor/bbc.py
+++ b/hypervideo_dl/extractor/bbc.py
@@ -11,6 +11,7 @@ from ..compat import (
     compat_etree_Element,
     compat_HTTPError,
     compat_str,
+    compat_urllib_error,
     compat_urlparse,
 )
 from ..utils import (
@@ -38,7 +39,7 @@ from ..utils import (
 class BBCCoUkIE(InfoExtractor):
     IE_NAME = 'bbc.co.uk'
     IE_DESC = 'BBC iPlayer'
-    _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})'
+    _ID_REGEX = r'(?:[pbml][\da-z]{7}|w[\da-z]{7,14})'
     _VALID_URL = r'''(?x)
                     https?://
                         (?:www\.)?bbc\.co\.uk/
@@ -263,11 +264,7 @@ class BBCCoUkIE(InfoExtractor):
         'only_matching': True,
     }]

-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
+    def _perform_login(self, username, password):
         login_page = self._download_webpage(
             self._LOGIN_URL, None, 'Downloading signin page')

@@ -293,9 +290,6 @@ class BBCCoUkIE(InfoExtractor):
                     'Unable to login: %s' % error, expected=True)
             raise ExtractorError('Unable to log in')

-    def _real_initialize(self):
-        self._login()
-
     class MediaSelectionError(Exception):
         def __init__(self, id):
             self.id = id
@@ -394,9 +388,17 @@ class BBCCoUkIE(InfoExtractor):
                 formats.extend(self._extract_mpd_formats(
                     href, programme_id, mpd_id=format_id, fatal=False))
             elif transfer_format == 'hls':
-                formats.extend(self._extract_m3u8_formats(
-                    href, programme_id, ext='mp4', entry_protocol='m3u8_native',
-                    m3u8_id=format_id, fatal=False))
+                # TODO: let expected_status be passed into _extract_xxx_formats() instead
+                try:
+                    fmts = self._extract_m3u8_formats(
+                        href, programme_id, ext='mp4', entry_protocol='m3u8_native',
+                        m3u8_id=format_id, fatal=False)
+                except ExtractorError as e:
+                    if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError)
+                            and e.exc_info[1].code in (403, 404)):
+                        raise
+                    fmts = []
+                formats.extend(fmts)
             elif transfer_format == 'hds':
                 formats.extend(self._extract_f4m_formats(
                     href, programme_id, f4m_id=format_id, fatal=False))
@@ -451,9 +453,10 @@ class BBCCoUkIE(InfoExtractor):
             playlist = self._download_json(
                 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id,
                 playlist_id, 'Downloading playlist JSON')
+            formats = []
+            subtitles = {}

-            version = playlist.get('defaultAvailableVersion')
-            if version:
+            for version in playlist.get('allAvailableVersions', []):
                 smp_config = version['smpConfig']
                 title = smp_config['title']
                 description = smp_config['summary']
@@ -463,8 +466,17 @@ class BBCCoUkIE(InfoExtractor):
                         continue
                     programme_id = item.get('vpid')
                     duration = int_or_none(item.get('duration'))
-                    formats, subtitles = self._download_media_selector(programme_id)
-                return programme_id, title, description, duration, formats, subtitles
+                    version_formats, version_subtitles = self._download_media_selector(programme_id)
+                    types = version['types']
+                    for f in version_formats:
+                        f['format_note'] = ', '.join(types)
+                        if any('AudioDescribed' in x for x in types):
+                            f['language_preference'] = -10
+                    formats += version_formats
+                    for tag, subformats in (version_subtitles or {}).items():
+                        subtitles.setdefault(tag, []).extend(subformats)
+
+            return programme_id, title, description, duration, formats, subtitles
         except ExtractorError as ee:
             if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404):
                 raise
@@ -774,21 +786,33 @@ class BBCIE(BBCCoUkIE):
             'timestamp': 1437785037,
             'upload_date': '20150725',
         },
+    }, {
+        # video with window.__INITIAL_DATA__ and value as JSON string
+        'url': 'https://www.bbc.com/news/av/world-europe-59468682',
+        'info_dict': {
+            'id': 'p0b71qth',
+            'ext': 'mp4',
+            'title': 'Why France is making this woman a national hero',
+            'description': 'md5:7affdfab80e9c3a1f976230a1ff4d5e4',
+            'thumbnail': r're:https?://.+/.+\.jpg',
+            'timestamp': 1638230731,
+            'upload_date': '20211130',
+        },
     }, {
         # single video article embedded with data-media-vpid
         'url': 'http://www.bbc.co.uk/sport/rowing/35908187',
         'only_matching': True,
     }, {
+        # bbcthreeConfig
         'url': 'https://www.bbc.co.uk/bbcthree/clip/73d0bbd0-abc3-4cea-b3c0-cdae21905eb1',
         'info_dict': {
             'id': 'p06556y7',
             'ext': 'mp4',
-            'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?',
-            'description': 'md5:4b7dfd063d5a789a1512e99662be3ddd',
+            'title': 'Things Not To Say to people that live on council estates',
+            'description': "From being labelled a 'chav', to the presumption that they're 'scroungers', people who live on council estates encounter all kinds of prejudices and false assumptions about themselves, their families, and their lifestyles. Here, eight people discuss the common statements, misconceptions, and clichés that they're tired of hearing.",
+            'duration': 360,
+            'thumbnail': r're:https?://.+/.+\.jpg',
         },
-        'params': {
-            'skip_download': True,
-        }
     }, {
         # window.__PRELOADED_STATE__
         'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
@@ -882,9 +906,8 @@ class BBCIE(BBCCoUkIE):

         playlist_title = json_ld_info.get('title')
         if not playlist_title:
-            playlist_title = self._og_search_title(
-                webpage, default=None) or self._html_search_regex(
-                r'<title>(.+?)</title>', webpage, 'playlist title', default=None)
+            playlist_title = (self._og_search_title(webpage, default=None)
+                              or self._html_extract_title(webpage, 'playlist title', default=None))
             if playlist_title:
                 playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip()
@@ -1161,9 +1184,16 @@ class BBCIE(BBCCoUkIE):
             return self.playlist_result(
                 entries, playlist_id, playlist_title, playlist_description)

-        initial_data = self._parse_json(self._search_regex(
-            r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage,
-            'preload state', default='{}'), playlist_id, fatal=False)
+        initial_data = self._search_regex(
+            r'window\.__INITIAL_DATA__\s*=\s*("{.+?}")\s*;', webpage,
+            'quoted preload state', default=None)
+        if initial_data is None:
+            initial_data = self._search_regex(
+                r'window\.__INITIAL_DATA__\s*=\s*({.+?})\s*;', webpage,
+                'preload state', default={})
+        else:
+            initial_data = self._parse_json(initial_data or '"{}"', playlist_id, fatal=False)
+        initial_data = self._parse_json(initial_data, playlist_id, fatal=False)
         if initial_data:
             def parse_media(media):
                 if not media:
@@ -1204,7 +1234,10 @@ class BBCIE(BBCCoUkIE):
                 if name == 'media-experience':
                     parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
                 elif name == 'article':
-                    for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []):
+                    for block in (try_get(resp,
+                                          (lambda x: x['data']['blocks'],
+                                           lambda x: x['data']['content']['model']['blocks'],),
+                                          list) or []):
                         if block.get('type') != 'media':
                             continue
                         parse_media(block.get('model'))
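The HLS change above tolerates 403/404 from individual media-selector hosts instead of failing the whole extraction, since some BBC CDN endpoints are fenced off. The patch inspects `exc_info` on the wrapped `ExtractorError`; the underlying pattern, simplified to plain urllib for illustration:

    import urllib.error

    def fetch_or_skip(fetch):
        # tolerate missing/forbidden variants, propagate anything else
        try:
            return fetch()
        except urllib.error.HTTPError as e:
            if e.code in (403, 404):
                return []
            raise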
diff --git a/hypervideo_dl/extractor/beeg.py b/hypervideo_dl/extractor/beeg.py
index 8fbabe7..717fff3 100644
--- a/hypervideo_dl/extractor/beeg.py
+++ b/hypervideo_dl/extractor/beeg.py
@@ -1,32 +1,45 @@
 from __future__ import unicode_literals

 from .common import InfoExtractor
-from ..compat import (
-    compat_str,
-)
+
 from ..utils import (
     int_or_none,
-    parse_qs,
+    traverse_obj,
+    try_get,
     unified_timestamp,
 )


 class BeegIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?beeg\.(?:com|porn(?:/video)?)/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?beeg\.(?:com(?:/video)?)/-?(?P<id>\d+)'
     _TESTS = [{
-        # api/v6 v1
-        'url': 'http://beeg.com/5416503',
-        'md5': 'a1a1b1a8bc70a89e49ccfd113aed0820',
+        'url': 'https://beeg.com/-0983946056129650',
+        'md5': '51d235147c4627cfce884f844293ff88',
         'info_dict': {
-            'id': '5416503',
+            'id': '0983946056129650',
             'ext': 'mp4',
-            'title': 'Sultry Striptease',
-            'description': 'md5:d22219c09da287c14bed3d6c37ce4bc2',
-            'timestamp': 1391813355,
-            'upload_date': '20140207',
-            'duration': 383,
+            'title': 'sucked cock and fucked in a private plane',
+            'duration': 927,
             'tags': list,
             'age_limit': 18,
+            'upload_date': '20220131',
+            'timestamp': 1643656455,
+            'display_id': 2540839,
+        }
+    }, {
+        'url': 'https://beeg.com/-0599050563103750?t=4-861',
+        'md5': 'bd8b5ea75134f7f07fad63008db2060e',
+        'info_dict': {
+            'id': '0599050563103750',
+            'ext': 'mp4',
+            'title': 'Bad Relatives',
+            'duration': 2060,
+            'tags': list,
+            'age_limit': 18,
+            'description': 'md5:b4fc879a58ae6c604f8f259155b7e3b9',
+            'timestamp': 1643623200,
+            'display_id': 2569965,
+            'upload_date': '20220131',
         }
     }, {
         # api/v6 v2
@@ -36,12 +49,6 @@ class BeegIE(InfoExtractor):
         # api/v6 v2 w/o t
         'url': 'https://beeg.com/1277207756',
         'only_matching': True,
-    }, {
-        'url': 'https://beeg.porn/video/5416503',
-        'only_matching': True,
-    }, {
-        'url': 'https://beeg.porn/5416503',
-        'only_matching': True,
     }]

     def _real_extract(self, url):
@@ -49,68 +56,38 @@ class BeegIE(InfoExtractor):

         webpage = self._download_webpage(url, video_id)

-        beeg_version = self._search_regex(
-            r'beeg_version\s*=\s*([\da-zA-Z_-]+)', webpage, 'beeg version',
-            default='1546225636701')
+        video = self._download_json(
+            'https://store.externulls.com/facts/file/%s' % video_id,
+            video_id, 'Downloading JSON for %s' % video_id)

-        if len(video_id) >= 10:
-            query = {
-                'v': 2,
-            }
-            qs = parse_qs(url)
-            t = qs.get('t', [''])[0].split('-')
-            if len(t) > 1:
-                query.update({
-                    's': t[0],
-                    'e': t[1],
-                })
-        else:
-            query = {'v': 1}
+        fc_facts = video.get('fc_facts')
+        first_fact = {}
+        for fact in fc_facts:
+            if not first_fact or try_get(fact, lambda x: x['id'] < first_fact['id']):
+                first_fact = fact

-        for api_path in ('', 'api.'):
-            video = self._download_json(
-                'https://%sbeeg.com/api/v6/%s/video/%s'
-                % (api_path, beeg_version, video_id), video_id,
-                fatal=api_path == 'api.', query=query)
-            if video:
-                break
+        resources = traverse_obj(video, ('file', 'hls_resources')) or first_fact.get('hls_resources')

         formats = []
-        for format_id, video_url in video.items():
-            if not video_url:
-                continue
-            height = self._search_regex(
-                r'^(\d+)[pP]$', format_id, 'height', default=None)
-            if not height:
+        for format_id, video_uri in resources.items():
+            if not video_uri:
                 continue
-            formats.append({
-                'url': self._proto_relative_url(
-                    video_url.replace('{DATA_MARKERS}', 'data=pc_XX__%s_0' % beeg_version), 'https:'),
-                'format_id': format_id,
-                'height': int(height),
-            })
-        self._sort_formats(formats)
-
-        title = video['title']
-        video_id = compat_str(video.get('id') or video_id)
-        display_id = video.get('code')
-        description = video.get('desc')
-        series = video.get('ps_name')
+            height = int_or_none(self._search_regex(r'fl_cdn_(\d+)', format_id, 'height', default=None))
+            current_formats = self._extract_m3u8_formats(f'https://video.beeg.com/{video_uri}', video_id, ext='mp4', m3u8_id=str(height))
+            for f in current_formats:
+                f['height'] = height
+            formats.extend(current_formats)

-        timestamp = unified_timestamp(video.get('date'))
-        duration = int_or_none(video.get('duration'))
-
-        tags = [tag.strip() for tag in video['tags'].split(',')] if video.get('tags') else None
+        self._sort_formats(formats)

         return {
             'id': video_id,
-            'display_id': display_id,
-            'title': title,
-            'description': description,
-            'series': series,
-            'timestamp': timestamp,
-            'duration': duration,
-            'tags': tags,
+            'display_id': first_fact.get('id'),
+            'title': traverse_obj(video, ('file', 'stuff', 'sf_name')),
+            'description': traverse_obj(video, ('file', 'stuff', 'sf_story')),
+            'timestamp': unified_timestamp(first_fact.get('fc_created')),
+            'duration': int_or_none(traverse_obj(video, ('file', 'fl_duration'))),
+            'tags': traverse_obj(video, ('tags', ..., 'tg_name')),
             'formats': formats,
             'age_limit': self._rta_search(webpage),
         }
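The Beeg rewrite pulls nearly all metadata out of one nested JSON document, hence the switch from chained `.get()` calls to `traverse_obj`. The idiom on a toy document, assuming the same helper semantics as yt-dlp's utils:

    from hypervideo_dl.utils import traverse_obj

    video = {'file': {'stuff': {'sf_name': 'Bad Relatives'}},
             'tags': [{'tg_name': 'a'}, {'tg_name': 'b'}]}
    traverse_obj(video, ('file', 'stuff', 'sf_name'))  # 'Bad Relatives'
    traverse_obj(video, ('tags', ..., 'tg_name'))      # ['a', 'b']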
diff --git a/hypervideo_dl/extractor/bigo.py b/hypervideo_dl/extractor/bigo.py
new file mode 100644
index 0000000..ddf76ac
--- /dev/null
+++ b/hypervideo_dl/extractor/bigo.py
@@ -0,0 +1,59 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import ExtractorError, urlencode_postdata
+
+
+class BigoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?bigo\.tv/(?:[a-z]{2,}/)?(?P<id>[^/]+)'
+
+    _TESTS = [{
+        'url': 'https://www.bigo.tv/ja/221338632',
+        'info_dict': {
+            'id': '6576287577575737440',
+            'title': '土よ〜💁‍♂️ 休憩室/REST room',
+            'thumbnail': r're:https?://.+',
+            'uploader': '✨Shin💫',
+            'uploader_id': '221338632',
+            'is_live': True,
+        },
+        'skip': 'livestream',
+    }, {
+        'url': 'https://www.bigo.tv/th/Tarlerm1304',
+        'only_matching': True,
+    }, {
+        'url': 'https://bigo.tv/115976881',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        user_id = self._match_id(url)
+
+        info_raw = self._download_json(
+            'https://bigo.tv/studio/getInternalStudioInfo',
+            user_id, data=urlencode_postdata({'siteId': user_id}))
+
+        if not isinstance(info_raw, dict):
+            raise ExtractorError('Received invalid JSON data')
+        if info_raw.get('code'):
+            raise ExtractorError(
+                'Bigo says: %s (code %s)' % (info_raw.get('msg'), info_raw.get('code')), expected=True)
+        info = info_raw.get('data') or {}
+
+        if not info.get('alive'):
+            raise ExtractorError('This user is offline.', expected=True)
+
+        return {
+            'id': info.get('roomId') or user_id,
+            'title': info.get('roomTopic') or info.get('nick_name') or user_id,
+            'formats': [{
+                'url': info.get('hls_src'),
+                'ext': 'mp4',
+                'protocol': 'm3u8',
+            }],
+            'thumbnail': info.get('snapshot'),
+            'uploader': info.get('nick_name'),
+            'uploader_id': user_id,
+            'is_live': True,
+        }
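BigoIE queries an internal POST endpoint instead of scraping the page; `urlencode_postdata` merely form-encodes the payload to bytes, which is what makes `_download_json` issue a POST. The equivalent request shape with only the standard library (illustrative; the endpoint's behaviour is not guaranteed to stay stable):

    from urllib.parse import urlencode
    from urllib.request import Request

    req = Request(
        'https://bigo.tv/studio/getInternalStudioInfo',
        data=urlencode({'siteId': '221338632'}).encode())  # POST because data is set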
diff --git a/hypervideo_dl/extractor/bilibili.py b/hypervideo_dl/extractor/bilibili.py
index 8d66b43..909f7f8 100644
--- a/hypervideo_dl/extractor/bilibili.py
+++ b/hypervideo_dl/extractor/bilibili.py
@@ -1,5 +1,6 @@
 # coding: utf-8
+import base64
 import hashlib
 import itertools
 import functools
@@ -14,19 +15,21 @@ from ..compat import (
 )
 from ..utils import (
     ExtractorError,
+    filter_dict,
     int_or_none,
     float_or_none,
+    mimetype2ext,
     parse_iso8601,
     traverse_obj,
-    try_get,
+    parse_count,
     smuggle_url,
     srt_subtitles_timecode,
     str_or_none,
-    str_to_int,
     strip_jsonp,
     unified_timestamp,
     unsmuggle_url,
     urlencode_postdata,
+    url_or_none,
     OnDemandPagedList
 )
@@ -50,16 +53,14 @@ class BiliBiliIE(InfoExtractor):
         'url':
'http://www.bilibili.com/video/av1074402/', 'md5': '5f7d29e1a2872f3df0cf76b1f87d3788', 'info_dict': { - 'id': '1074402', - 'ext': 'flv', + 'id': '1074402_part1', + 'ext': 'mp4', 'title': '【金坷垃】金泡沫', + 'uploader_id': '156160', + 'uploader': '菊子桑', + 'upload_date': '20140420', 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', - 'duration': 308.067, 'timestamp': 1398012678, - 'upload_date': '20140420', - 'thumbnail': r're:^https?://.+\.jpg', - 'uploader': '菊子桑', - 'uploader_id': '156160', }, }, { # Tested in BiliBiliBangumiIE @@ -73,49 +74,27 @@ class BiliBiliIE(InfoExtractor): 'url': 'http://bangumi.bilibili.com/anime/5802/play#100643', 'md5': '3f721ad1e75030cc06faf73587cfec57', 'info_dict': { - 'id': '100643', + 'id': '100643_part1', 'ext': 'mp4', 'title': 'CHAOS;CHILD', 'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...', }, 'skip': 'Geo-restricted to China', }, { - # Title with double quotes 'url': 'http://www.bilibili.com/video/av8903802/', 'info_dict': { - 'id': '8903802', + 'id': '8903802_part1', + 'ext': 'mp4', 'title': '阿滴英文|英文歌分享#6 "Closer', + 'upload_date': '20170301', 'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文', + 'timestamp': 1488382634, + 'uploader_id': '65880958', + 'uploader': '阿滴英文', + }, + 'params': { + 'skip_download': True, }, - 'playlist': [{ - 'info_dict': { - 'id': '8903802_part1', - 'ext': 'flv', - 'title': '阿滴英文|英文歌分享#6 "Closer', - 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a', - 'uploader': '阿滴英文', - 'uploader_id': '65880958', - 'timestamp': 1488382634, - 'upload_date': '20170301', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'info_dict': { - 'id': '8903802_part2', - 'ext': 'flv', - 'title': '阿滴英文|英文歌分享#6 "Closer', - 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a', - 'uploader': '阿滴英文', - 'uploader_id': '65880958', - 'timestamp': 1488382634, - 'upload_date': '20170301', - }, - 'params': { - 'skip_download': True, - }, - }] }, { # new BV video id format 'url': 'https://www.bilibili.com/video/BV1JE411F741', @@ -150,6 +129,7 @@ class BiliBiliIE(InfoExtractor): av_id, bv_id = self._get_video_id_set(video_id, mobj.group('id_bv') is not None) video_id = av_id + info = {} anime_id = mobj.group('anime_id') page_id = mobj.group('page') webpage = self._download_webpage(url, video_id) @@ -201,66 +181,95 @@ class BiliBiliIE(InfoExtractor): } headers.update(self.geo_verification_headers()) + video_info = self._parse_json( + self._search_regex(r'window.__playinfo__\s*=\s*({.+?})', webpage, 'video info', default=None) or '{}', + video_id, fatal=False) + video_info = video_info.get('data') or {} + + durl = traverse_obj(video_info, ('dash', 'video')) + audios = traverse_obj(video_info, ('dash', 'audio')) or [] entries = [] RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4') for num, rendition in enumerate(RENDITIONS, start=1): payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition) sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() - - video_info = self._download_json( - 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign), - video_id, note='Downloading video info page', - headers=headers, fatal=num == len(RENDITIONS)) - if not video_info: - continue + video_info = self._download_json( + 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign), + video_id, note='Downloading video info page', + headers=headers, fatal=num == len(RENDITIONS)) + if not 
video_info:
+                continue

-            if 'durl' not in video_info:
+            if not durl and 'durl' not in video_info:
                 if num < len(RENDITIONS):
                     continue
                 self._report_error(video_info)

-            for idx, durl in enumerate(video_info['durl']):
-                formats = [{
-                    'url': durl['url'],
-                    'filesize': int_or_none(durl['size']),
-                }]
-                for backup_url in durl.get('backup_url', []):
+            formats = []
+            for idx, durl in enumerate(durl or video_info['durl']):
+                formats.append({
+                    'url': durl.get('baseUrl') or durl.get('base_url') or durl.get('url'),
+                    'ext': mimetype2ext(durl.get('mimeType') or durl.get('mime_type')),
+                    'fps': int_or_none(durl.get('frameRate') or durl.get('frame_rate')),
+                    'width': int_or_none(durl.get('width')),
+                    'height': int_or_none(durl.get('height')),
+                    'vcodec': durl.get('codecs'),
+                    'acodec': 'none' if audios else None,
+                    'tbr': float_or_none(durl.get('bandwidth'), scale=1000),
+                    'filesize': int_or_none(durl.get('size')),
+                })
+                for backup_url in traverse_obj(durl, 'backup_url', expected_type=list) or []:
                     formats.append({
                         'url': backup_url,
-                        # backup URLs have lower priorities
                         'quality': -2 if 'hd.mp4' in backup_url else -3,
                     })

-            for a_format in formats:
-                a_format.setdefault('http_headers', {}).update({
-                    'Referer': url,
+            for audio in audios:
+                formats.append({
+                    'url': audio.get('baseUrl') or audio.get('base_url') or audio.get('url'),
+                    'ext': mimetype2ext(audio.get('mimeType') or audio.get('mime_type')),
+                    'fps': int_or_none(audio.get('frameRate') or audio.get('frame_rate')),
+                    'width': int_or_none(audio.get('width')),
+                    'height': int_or_none(audio.get('height')),
+                    'acodec': audio.get('codecs'),
+                    'vcodec': 'none',
+                    'tbr': float_or_none(audio.get('bandwidth'), scale=1000),
+                    'filesize': int_or_none(audio.get('size'))
                 })
+                for backup_url in traverse_obj(audio, 'backup_url', expected_type=list) or []:
+                    formats.append({
+                        'url': backup_url,
+                        # backup URLs have lower priorities
+                        'quality': -3,
+                    })

-            self._sort_formats(formats)
-
-            entries.append({
-                'id': '%s_part%s' % (video_id, idx),
-                'duration': float_or_none(durl.get('length'), 1000),
-                'formats': formats,
-            })
+            info.update({
+                'id': video_id,
+                'duration': float_or_none(durl.get('length'), 1000),
+                'formats': formats,
+                'http_headers': {
+                    'Referer': url,
+                },
+            })
             break

-        title = self._html_search_regex(
-            (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
-             r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
-            group='title')
+        self._sort_formats(formats)
+
+        title = self._html_search_regex((
+            r'<h1[^>]+title=(["\'])(?P<content>[^"\']+)',
+            r'(?s)<h1[^>]*>(?P<content>.+?)</h1>',
+            self._meta_regex('title')
+        ), webpage, 'title', group='content', fatal=False)

         # Get part title for anthologies
         if page_id is not None:
-            # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video
-            part_title = try_get(
-                self._download_json(
-                    f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp',
-                    video_id, note='Extracting videos in anthology'),
-                lambda x: x['data'][int(page_id) - 1]['part'])
-            title = part_title or title
+            # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video.
+            part_info = traverse_obj(self._download_json(
+                f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp',
+                video_id, note='Extracting videos in anthology'), 'data', expected_type=list)
+            title = title if len(part_info) == 1 else traverse_obj(part_info, (int(page_id) - 1, 'part')) or title

         description = self._html_search_meta('description', webpage)
         timestamp = unified_timestamp(self._html_search_regex(
@@ -270,15 +279,15 @@ class BiliBiliIE(InfoExtractor):
         thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage)

         # TODO 'view_count' requires deobfuscating Javascript
-        info = {
-            'id': str(video_id) if page_id is None else '%s_part%s' % (video_id, page_id),
+        info.update({
+            'id': f'{video_id}_part{page_id or 1}',
             'cid': cid,
             'title': title,
             'description': description,
             'timestamp': timestamp,
             'thumbnail': thumbnail,
             'duration': float_or_none(video_info.get('timelength'), scale=1000),
-        }
+        })

         uploader_mobj = re.search(
             r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>\s*(?P<name>[^<]+?)\s*<',
@@ -299,7 +308,7 @@ class BiliBiliIE(InfoExtractor):
                 video_id, fatal=False, note='Downloading tags'), ('data', ..., 'tag_name')),
         }

-        entries[0]['subtitles'] = {
+        info['subtitles'] = {
             'danmaku': [{
                 'ext': 'xml',
                 'url': f'https://comment.bilibili.com/{cid}.xml',
@@ -334,19 +343,18 @@ class BiliBiliIE(InfoExtractor):
             entry['id'] = '%s_part%d' % (video_id, (idx + 1))

         return {
-            '_type': 'multi_video',
             'id': str(video_id),
             'bv_id': bv_id,
             'title': title,
             'description': description,
-            'entries': entries,
             **info, **top_level_info
         }
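With the `window.__playinfo__` DASH data, video and audio arrive as separate streams; marking each format one-sided (`acodec: 'none'` or `vcodec: 'none'`) is what tells the downloader the two lists must be muxed together. The resulting shape, with made-up URLs and codecs for illustration:

    formats = [
        {'url': 'https://example.bilivideo.com/v.m4s', 'vcodec': 'avc1.640032', 'acodec': 'none'},   # video-only
        {'url': 'https://example.bilivideo.com/a.m4s', 'vcodec': 'none', 'acodec': 'mp4a.40.2'},     # audio-only
    ]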
     def _extract_anthology_entries(self, bv_id, video_id, webpage):
         title = self._html_search_regex(
             (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1',
-             r'(?s)<h1[^>]*>(?P<title>.+?)</h1>'), webpage, 'title',
+             r'(?s)<h1[^>]*>(?P<title>.+?)</h1>',
+             r'<title>(?P<title>.+?)</title>'), webpage, 'title',
             group='title')
         json_data = self._download_json(
             f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp',
@@ -376,8 +384,10 @@ class BiliBiliIE(InfoExtractor):
         replies = traverse_obj(
             self._download_json(
                 f'https://api.bilibili.com/x/v2/reply?pn={idx}&oid={video_id}&type=1&jsonp=jsonp&sort=2&_=1567227301685',
-                video_id, note=f'Extracting comments from page {idx}'),
-            ('data', 'replies')) or []
+                video_id, note=f'Extracting comments from page {idx}', fatal=False),
+            ('data', 'replies'))
+        if not replies:
+            return
         for children in map(self._get_all_children, replies):
             yield from children
@@ -477,9 +487,9 @@ class BilibiliChannelIE(InfoExtractor):
             data = self._download_json(
                 self._API_URL % (list_id, page_num), list_id,
                 note=f'Downloading page {page_num}')['data']

-            max_count = max_count or try_get(data, lambda x: x['page']['count'])
+            max_count = max_count or traverse_obj(data, ('page', 'count'))

-            entries = try_get(data, lambda x: x['list']['vlist'])
+            entries = traverse_obj(data, ('list', 'vlist'))
             if not entries:
                 return
             for entry in entries:
@@ -517,7 +527,7 @@ class BilibiliCategoryIE(InfoExtractor):
                 api_url, query, query={'Search_key': query, 'pn': page_num},
                 note='Extracting results from page %s of %s' % (page_num, num_pages))

-            video_list = try_get(parsed_json, lambda x: x['data']['archives'], list)
+            video_list = traverse_obj(parsed_json, ('data', 'archives'), expected_type=list)
             if not video_list:
                 raise ExtractorError('Failed to retrieve video list for page %d' % page_num)
@@ -547,7 +557,7 @@ class BilibiliCategoryIE(InfoExtractor):
         api_url =
'https://api.bilibili.com/x/web-interface/newlist?rid=%d&type=1&ps=20&jsonp=jsonp' % rid_value page_json = self._download_json(api_url, query, query={'Search_key': query, 'pn': '1'}) - page_data = try_get(page_json, lambda x: x['data']['page'], dict) + page_data = traverse_obj(page_json, ('data', 'page'), expected_type=dict) count, size = int_or_none(page_data.get('count')), int_or_none(page_data.get('size')) if count is None or not size: raise ExtractorError('Failed to calculate either page count or size') @@ -566,7 +576,7 @@ class BilibiliCategoryIE(InfoExtractor): class BiliBiliSearchIE(SearchInfoExtractor): - IE_DESC = 'Bilibili video search, "bilisearch" keyword' + IE_DESC = 'Bilibili video search' _MAX_RESULTS = 100000 _SEARCH_KEY = 'bilisearch' @@ -719,40 +729,68 @@ class BiliBiliPlayerIE(InfoExtractor): class BiliIntlBaseIE(InfoExtractor): - _API_URL = 'https://api.bili{}/intl/gateway{}' - - def _call_api(self, type, endpoint, id): - return self._download_json(self._API_URL.format(type, endpoint), id)['data'] + _API_URL = 'https://api.bilibili.tv/intl/gateway' + _NETRC_MACHINE = 'biliintl' + + def _call_api(self, endpoint, *args, **kwargs): + json = self._download_json(self._API_URL + endpoint, *args, **kwargs) + if json.get('code'): + if json['code'] in (10004004, 10004005, 10023006): + self.raise_login_required() + elif json['code'] == 10004001: + self.raise_geo_restricted() + else: + if json.get('message') and str(json['code']) != json['message']: + errmsg = f'{kwargs.get("errnote", "Unable to download JSON metadata")}: {self.IE_NAME} said: {json["message"]}' + else: + errmsg = kwargs.get('errnote', 'Unable to download JSON metadata') + if kwargs.get('fatal'): + raise ExtractorError(errmsg) + else: + self.report_warning(errmsg) + return json.get('data') def json2srt(self, json): data = '\n\n'.join( f'{i + 1}\n{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n{line["content"]}' - for i, line in enumerate(json['body'])) + for i, line in enumerate(json['body']) if line.get('content')) return data - def _get_subtitles(self, type, ep_id): - sub_json = self._call_api(type, f'/m/subtitle?ep_id={ep_id}&platform=web', ep_id) + def _get_subtitles(self, *, ep_id=None, aid=None): + sub_json = self._call_api( + '/web/v2/subtitle', ep_id or aid, note='Downloading subtitles list', + errnote='Unable to download subtitles list', query=filter_dict({ + 'platform': 'web', + 'episode_id': ep_id, + 'aid': aid, + })) subtitles = {} - for sub in sub_json.get('subtitles', []): + for sub in sub_json.get('subtitles') or []: sub_url = sub.get('url') if not sub_url: continue - sub_data = self._download_json(sub_url, ep_id, fatal=False) + sub_data = self._download_json( + sub_url, ep_id or aid, errnote='Unable to download subtitles', fatal=False, + note='Downloading subtitles%s' % f' for {sub["lang"]}' if sub.get('lang') else '') if not sub_data: continue - subtitles.setdefault(sub.get('key', 'en'), []).append({ + subtitles.setdefault(sub.get('lang_key', 'en'), []).append({ 'ext': 'srt', 'data': self.json2srt(sub_data) }) return subtitles - def _get_formats(self, type, ep_id): - video_json = self._call_api(type, f'/web/playurl?ep_id={ep_id}&platform=web', ep_id) - if not video_json: - self.raise_login_required(method='cookies') + def _get_formats(self, *, ep_id=None, aid=None): + video_json = self._call_api( + '/web/playurl', ep_id or aid, note='Downloading video formats', + errnote='Unable to download video formats', query=filter_dict({ + 'platform': 'web', + 'ep_id': ep_id, + 
'aid': aid, + })) video_json = video_json['playurl'] formats = [] - for vid in video_json.get('video', []): + for vid in video_json.get('video') or []: video_res = vid.get('video_resource') or {} video_info = vid.get('stream_info') or {} if not video_res.get('url'): @@ -768,7 +806,7 @@ class BiliIntlBaseIE(InfoExtractor): 'vcodec': video_res.get('codecs'), 'filesize': video_res.get('size'), }) - for aud in video_json.get('audio_resource', []): + for aud in video_json.get('audio_resource') or []: if not aud.get('url'): continue formats.append({ @@ -783,85 +821,148 @@ class BiliIntlBaseIE(InfoExtractor): self._sort_formats(formats) return formats - def _extract_ep_info(self, type, episode_data, ep_id): + def _extract_video_info(self, video_data, *, ep_id=None, aid=None): return { - 'id': ep_id, - 'title': episode_data.get('long_title') or episode_data['title'], - 'thumbnail': episode_data.get('cover'), - 'episode_number': str_to_int(episode_data.get('title')), - 'formats': self._get_formats(type, ep_id), - 'subtitles': self._get_subtitles(type, ep_id), + 'id': ep_id or aid, + 'title': video_data.get('title_display') or video_data.get('title'), + 'thumbnail': video_data.get('cover'), + 'episode_number': int_or_none(self._search_regex( + r'^E(\d+)(?:$| - )', video_data.get('title_display') or '', 'episode number', default=None)), + 'formats': self._get_formats(ep_id=ep_id, aid=aid), + 'subtitles': self._get_subtitles(ep_id=ep_id, aid=aid), 'extractor_key': BiliIntlIE.ie_key(), } + def _perform_login(self, username, password): + try: + from Cryptodome.PublicKey import RSA + from Cryptodome.Cipher import PKCS1_v1_5 + except ImportError: + try: + from Crypto.PublicKey import RSA + from Crypto.Cipher import PKCS1_v1_5 + except ImportError: + raise ExtractorError('pycryptodomex not found. 
Please install', expected=True)
+
+        key_data = self._download_json(
+            'https://passport.bilibili.tv/x/intl/passport-login/web/key?lang=en-US', None,
+            note='Downloading login key', errnote='Unable to download login key')['data']
+
+        public_key = RSA.importKey(key_data['key'])
+        password_hash = PKCS1_v1_5.new(public_key).encrypt((key_data['hash'] + password).encode('utf-8'))
+        login_post = self._download_json(
+            'https://passport.bilibili.tv/x/intl/passport-login/web/login/password?lang=en-US', None, data=urlencode_postdata({
+                'username': username,
+                'password': base64.b64encode(password_hash).decode('ascii'),
+                'keep_me': 'true',
+                's_locale': 'en_US',
+                'isTrusted': 'true'
+            }), note='Logging in', errnote='Unable to log in')
+        if login_post.get('code'):
+            if login_post.get('message'):
+                raise ExtractorError(f'Unable to log in: {self.IE_NAME} said: {login_post["message"]}', expected=True)
+            else:
+                raise ExtractorError('Unable to log in')
+

 class BiliIntlIE(BiliIntlBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?bili(?P<type>bili\.tv|intl.com)/(?:[a-z]{2}/)?play/(?P<season_id>\d+)/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?(play/(?P<season_id>\d+)/(?P<ep_id>\d+)|video/(?P<aid>\d+))'
     _TESTS = [{
+        # Bstation page
         'url': 'https://www.bilibili.tv/en/play/34613/341736',
         'info_dict': {
             'id': '341736',
             'ext': 'mp4',
-            'title': 'The First Night',
-            'thumbnail': 'https://i0.hdslb.com/bfs/intl/management/91e30e5521235d9b163339a26a0b030ebda54310.png',
+            'title': 'E2 - The First Night',
+            'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
             'episode_number': 2,
-        },
-        'params': {
-            'format': 'bv',
-        },
+        }
     }, {
-        'url': 'https://www.biliintl.com/en/play/34613/341736',
+        # Non-Bstation page
+        'url': 'https://www.bilibili.tv/en/play/1033760/11005006',
         'info_dict': {
-            'id': '341736',
+            'id': '11005006',
             'ext': 'mp4',
-            'title': 'The First Night',
-            'thumbnail': 'https://i0.hdslb.com/bfs/intl/management/91e30e5521235d9b163339a26a0b030ebda54310.png',
-            'episode_number': 2,
-        },
-        'params': {
-            'format': 'bv',
+            'title': 'E3 - Who?',
+            'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
+            'episode_number': 3,
+        }
+    }, {
+        # Subtitle with empty content
+        'url': 'https://www.bilibili.tv/en/play/1005144/10131790',
+        'info_dict': {
+            'id': '10131790',
+            'ext': 'mp4',
+            'title': 'E140 - Two Heartbeats: Kabuto\'s Trap',
+            'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
+            'episode_number': 140,
         },
+        'skip': 'According to the copyright owner\'s request, you may only watch the video after you log in.'
+    }, {
+        'url': 'https://www.biliintl.com/en/play/34613/341736',
+        'only_matching': True,
+    }, {
+        # User-generated content (as opposed to a series licensed from a studio)
+        'url': 'https://bilibili.tv/en/video/2019955076',
+        'only_matching': True,
+    }, {
+        # No language in URL
+        'url': 'https://www.bilibili.tv/video/2019955076',
+        'only_matching': True,
     }]

     def _real_extract(self, url):
-        type, season_id, id = self._match_valid_url(url).groups()
-        data_json = self._call_api(type, f'/web/view/ogv_collection?season_id={season_id}', id)
-        episode_data = next(
-            episode for episode in data_json.get('episodes', [])
-            if str(episode.get('ep_id')) == id)
-        return self._extract_ep_info(type, episode_data, id)
+        season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid')
+        video_id = ep_id or aid
+        webpage = self._download_webpage(url, video_id)
+        # Bstation layout
+        initial_data = self._parse_json(self._search_regex(
+            r'window\.__INITIAL_(?:DATA|STATE)__\s*=\s*({.+?});', webpage,
+            'preload state', default='{}'), video_id, fatal=False) or {}
+        video_data = (
+            traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict)
+            or traverse_obj(initial_data, ('UgcVideo', 'videoData'), expected_type=dict) or {})
+
+        if season_id and not video_data:
+            # Non-Bstation layout, read through episode list
+            season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id)
+            video_data = traverse_obj(season_json,
+                                      ('sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == ep_id),
+                                      expected_type=dict, get_all=False)
+        return self._extract_video_info(video_data, ep_id=ep_id, aid=aid)
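The new Bstation login encrypts `hash + password` with the RSA key the API hands back, then posts the ciphertext base64-encoded. Just the crypto step, with a locally generated stand-in key instead of the server's PEM (requires pycryptodomex):

    import base64
    from Cryptodome.PublicKey import RSA
    from Cryptodome.Cipher import PKCS1_v1_5

    key = RSA.generate(2048)                    # stand-in for key_data['key'] from the API
    cipher = PKCS1_v1_5.new(key.publickey())
    payload = base64.b64encode(cipher.encrypt(('somehash' + 'hunter2').encode('utf-8')))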

 class BiliIntlSeriesIE(BiliIntlBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?bili(?P<type>bili\.tv|intl.com)/(?:[a-z]{2}/)?play/(?P<id>\d+)$'
+    _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<id>\d+)$'
     _TESTS = [{
         'url': 'https://www.bilibili.tv/en/play/34613',
         'playlist_mincount': 15,
         'info_dict': {
             'id': '34613',
+            'title': 'Fly Me to the Moon',
+            'description': 'md5:a861ee1c4dc0acfad85f557cc42ac627',
+            'categories': ['Romance', 'Comedy', 'Slice of life'],
+            'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$',
+            'view_count': int,
         },
         'params': {
             'skip_download': True,
-            'format': 'bv',
         },
     }, {
         'url': 'https://www.biliintl.com/en/play/34613',
-        'playlist_mincount': 15,
-        'info_dict': {
-            'id': '34613',
-        },
-        'params': {
-            'skip_download': True,
-            'format': 'bv',
-        },
+        'only_matching': True,
     }]

-    def _entries(self, id, type):
-        data_json = self._call_api(type, f'/web/view/ogv_collection?season_id={id}', id)
-        for episode in data_json.get('episodes', []):
-            episode_id = str(episode.get('ep_id'))
-            yield self._extract_ep_info(type, episode, episode_id)
+    def _entries(self, series_id):
+        series_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={series_id}&platform=web', series_id)
+        for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict, default=[]):
+            episode_id = str(episode.get('episode_id'))
+            yield self._extract_video_info(episode, ep_id=episode_id)

     def _real_extract(self, url):
-        type, id = self._match_valid_url(url).groups()
-        return self.playlist_result(self._entries(id, type), playlist_id=id)
+        series_id = self._match_id(url)
+        series_info = self._call_api(f'/web/v2/ogv/play/season_info?season_id={series_id}&platform=web', series_id).get('season') or {}
+        return self.playlist_result(
+            self._entries(series_id), series_id, series_info.get('title'), series_info.get('description'),
+            categories=traverse_obj(series_info, ('styles', ..., 'title'), expected_type=str_or_none),
+            thumbnail=url_or_none(series_info.get('horizontal_cover')), view_count=parse_count(series_info.get('view')))
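`parse_count` (newly imported above) normalizes the API's human-formatted view counter into an integer. Illustrative inputs, assuming the yt-dlp-compatible helper semantics:

    from hypervideo_dl.utils import parse_count

    parse_count('3.2M')   # 3200000
    parse_count('1,024')  # 1024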
diff --git a/hypervideo_dl/extractor/biqle.py b/hypervideo_dl/extractor/biqle.py
index 17ebbb2..2b57bad 100644
--- a/hypervideo_dl/extractor/biqle.py
+++ b/hypervideo_dl/extractor/biqle.py
@@ -3,27 +3,28 @@ from __future__ import unicode_literals

 from .common import InfoExtractor
 from .vk import VKIE
-from ..compat import (
-    compat_b64decode,
-    compat_urllib_parse_unquote,
+from ..compat import compat_b64decode
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    traverse_obj,
+    unified_timestamp,
 )
-from ..utils import int_or_none


 class BIQLEIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P<id>-?\d+_\d+)'
     _TESTS = [{
-        # Youtube embed
-        'url': 'https://biqle.ru/watch/-115995369_456239081',
-        'md5': '97af5a06ee4c29bbf9c001bdb1cf5c06',
+        'url': 'https://biqle.ru/watch/-2000421746_85421746',
+        'md5': 'ae6ef4f04d19ac84e4658046d02c151c',
         'info_dict': {
-            'id': '8v4f-avW-VI',
+            'id': '-2000421746_85421746',
             'ext': 'mp4',
-            'title': "PASSE-PARTOUT - L'ete c'est fait pour jouer",
-            'description': 'Passe-Partout',
-            'uploader_id': 'mrsimpsonstef3',
-            'uploader': 'Phanolito',
-            'upload_date': '20120822',
+            'title': 'Forsaken By Hope Studio Clip',
+            'description': 'Forsaken By Hope Studio Clip — Смотреть онлайн',
+            'upload_date': '19700101',
+            'thumbnail': r're:https://[^/]+/impf/7vN3ACwSTgChP96OdOfzFjUCzFR6ZglDQgWsIw/KPaACiVJJxM\.jpg\?size=800x450&quality=96&keep_aspect_ratio=1&background=000000&sign=b48ea459c4d33dbcba5e26d63574b1cb&type=video_thumb',
+            'timestamp': 0,
         },
     }, {
         'url': 'http://biqle.org/watch/-44781847_168547604',
@@ -32,53 +33,62 @@ class BIQLEIE(InfoExtractor):
             'id': '-44781847_168547604',
             'ext': 'mp4',
             'title': 'Ребенок в шоке от автоматической мойки',
+            'description': 'Ребенок в шоке от автоматической мойки — Смотреть онлайн',
             'timestamp': 1396633454,
-            'uploader': 'Dmitry Kotov',
             'upload_date': '20140404',
-            'uploader_id': '47850140',
+            'thumbnail': r're:https://[^/]+/c535507/u190034692/video/l_b84df002\.jpg',
         },
     }]

     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
-        embed_url = self._proto_relative_url(self._search_regex(
-            r'<iframe.+?src="((?:https?:)?//daxab\.com/[^"]+)".*?></iframe>',
-            webpage, 'embed url'))
+
+        title = self._html_search_meta('name', webpage, 'Title', fatal=False)
+        timestamp = unified_timestamp(self._html_search_meta('uploadDate', webpage, 'Upload Date', default=None))
+        description = self._html_search_meta('description', webpage, 'Description', default=None)
+
+        global_embed_url = self._search_regex(
+            r'<script[^<]+?window.globEmbedUrl\s*=\s*\'((?:https?:)?//(?:daxab\.com|dxb\.to|[^/]+/player)/[^\']+)\'',
+            webpage, 'global Embed url')
+        hash = self._search_regex(
+            r'<script id="data-embed-video[^<]+?hash: "([^"]+)"[^<]*</script>', webpage, 'Hash')
+
+        embed_url = global_embed_url + hash
+
         if VKIE.suitable(embed_url):
             return self.url_result(embed_url, VKIE.ie_key(), video_id)

         embed_page = self._download_webpage(
-            embed_url, video_id, headers={'Referer': url})
-        video_ext = self._get_cookies(embed_url).get('video_ext')
-        if video_ext:
-            video_ext = compat_urllib_parse_unquote(video_ext.value)
-        if not video_ext:
-            video_ext = compat_b64decode(self._search_regex(
-                r'video_ext\s*:\s*[\'"]([A-Za-z0-9+/=]+)',
-                embed_page, 'video_ext')).decode()
-        video_id, sig, _, access_token = video_ext.split(':')
+            embed_url, video_id, 'Downloading embed webpage', headers={'Referer': url})
+
+        glob_params = self._parse_json(self._search_regex(
+            r'<script id="globParams">[\s\S]*?window\.globParams\s*=\s*({[\S\s]+?});[\s\S]*?</script>',
+            embed_page, 'Global Parameters'), video_id, transform_source=js_to_json)
+        host_name = compat_b64decode(glob_params['server'][::-1]).decode()
+
         item = self._download_json(
-            'https://api.vk.com/method/video.get', video_id,
-            headers={'User-Agent': 'okhttp/3.4.1'}, query={
-                'access_token': access_token,
-                'sig': sig,
-                'v': 5.44,
+            f'https://{host_name}/method/video.get/{video_id}', video_id,
+            headers={'Referer': url}, query={
+                'token': glob_params['video']['access_token'],
                 'videos': video_id,
+                'ckey': glob_params['c_key'],
+                'credentials': glob_params['video']['credentials'],
             })['response']['items'][0]
-        title = item['title']

         formats = []
         for f_id, f_url in item.get('files', {}).items():
             if f_id == 'external':
                 return self.url_result(f_url)
             ext, height = f_id.split('_')
-            formats.append({
-                'format_id': height + 'p',
-                'url': f_url,
-                'height': int_or_none(height),
-                'ext': ext,
-            })
+            height_extra_key = traverse_obj(glob_params, ('video', 'partial', 'quality', height))
+            if height_extra_key:
+                formats.append({
+                    'format_id': f'{height}p',
+                    'url': f'https://{host_name}/{f_url[8:]}&videos={video_id}&extra_key={height_extra_key}',
+                    'height': int_or_none(height),
+                    'ext': ext,
+                })
         self._sort_formats(formats)

         thumbnails = []
@@ -96,10 +106,9 @@ class BIQLEIE(InfoExtractor):
             'title': title,
             'formats': formats,
             'comment_count': int_or_none(item.get('comments')),
-            'description': item.get('description'),
+            'description': description,
             'duration': int_or_none(item.get('duration')),
             'thumbnails': thumbnails,
-            'timestamp': int_or_none(item.get('date')),
-            'uploader': item.get('owner_id'),
+            'timestamp': timestamp,
             'view_count': int_or_none(item.get('views')),
         }
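The embed page obfuscates the API host as a reversed base64 string, which is all `compat_b64decode(glob_params['server'][::-1])` undoes. The same decode with the standard library, using an illustrative value that round-trips to the `daxab.com` host seen in the regex above:

    import base64

    server = 't92YuIWY4FGZ'                       # as it might appear in globParams['server']
    host = base64.b64decode(server[::-1]).decode()  # 'daxab.com'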
diff --git a/hypervideo_dl/extractor/bitwave.py b/hypervideo_dl/extractor/bitwave.py
index eb16c46..e6e093f 100644
--- a/hypervideo_dl/extractor/bitwave.py
+++ b/hypervideo_dl/extractor/bitwave.py
@@ -51,7 +51,7 @@ class BitwaveStreamIE(InfoExtractor):

         return {
             'id': username,
-            'title': self._live_title(channel['data']['title']),
+            'title': channel['data']['title'],
             'uploader': username,
             'uploader_id': username,
             'formats': formats,
diff --git a/hypervideo_dl/extractor/blogger.py b/hypervideo_dl/extractor/blogger.py
new file mode 100644
index 0000000..dba131c
--- /dev/null
+++ b/hypervideo_dl/extractor/blogger.py
@@ -0,0 +1,54 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from ..utils import (
+    mimetype2ext,
+    parse_duration,
+    parse_qs,
+    str_or_none,
+    traverse_obj,
+)
+from .common import InfoExtractor
+
+
+class BloggerIE(InfoExtractor):
+    IE_NAME = 'blogger.com'
+    _VALID_URL = r'https?://(?:www\.)?blogger\.com/video\.g\?token=(?P<id>.+)'
+    _VALID_EMBED = r'''<iframe[^>]+src=["']((?:https?:)?//(?:www\.)?blogger\.com/video\.g\?token=[^"']+)["']'''
+    _TESTS = [{
+        'url': 'https://www.blogger.com/video.g?token=AD6v5dzEe9hfcARr5Hlq1WTkYy6t-fXH3BBahVhGvVHe5szdEUBEloSEDSTA8-b111089KbfWuBvTN7fnbxMtymsHhXAXwVvyzHH4Qch2cfLQdGxKQrrEuFpC1amSl_9GuLWODjPgw',
+        'md5': 'f1bc19b6ea1b0fd1d81e84ca9ec467ac',
+        'info_dict': {
+            'id': 'BLOGGER-video-3c740e3a49197e16-796',
+            'title': 'BLOGGER-video-3c740e3a49197e16-796',
+            'ext': 'mp4',
+            'thumbnail': r're:^https?://.*',
+            'duration': 76.068,
+        }
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        return re.findall(BloggerIE._VALID_EMBED, webpage)
+
+    def _real_extract(self, url):
+        token_id = self._match_id(url)
+        webpage = self._download_webpage(url, token_id)
+        data_json = self._search_regex(r'var\s+VIDEO_CONFIG\s*=\s*(\{.*)', webpage, 'JSON data')
+        data = self._parse_json(data_json.encode('utf-8').decode('unicode_escape'), token_id)
+        streams = data['streams']
+        formats = [{
+            'ext': mimetype2ext(traverse_obj(parse_qs(stream['play_url']), ('mime', 0))),
+            'url': stream['play_url'],
+            'format_id': str_or_none(stream.get('format_id')),
+        } for stream in streams]
+
+        return {
+            'id': data.get('iframe_id', token_id),
+            'title': data.get('iframe_id', token_id),
+            'formats': formats,
+            'thumbnail': data.get('thumbnail'),
+            'duration': parse_duration(traverse_obj(parse_qs(streams[0]['play_url']), ('dur', 0))),
+        }
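Blogger's stream URLs carry their own metadata in the query string, so the extractor reads `mime` and `dur` straight out of `play_url`. The same step with only the standard library, on an illustrative URL:

    from urllib.parse import parse_qs, urlparse

    play_url = 'https://example.googlevideo.com/videoplayback?mime=video%2Fmp4&dur=76.068'
    qs = parse_qs(urlparse(play_url).query)
    qs['mime'][0], float(qs['dur'][0])  # ('video/mp4', 76.068)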
diff --git a/hypervideo_dl/extractor/bongacams.py b/hypervideo_dl/extractor/bongacams.py
index 9e75511..4e346e7 100644
--- a/hypervideo_dl/extractor/bongacams.py
+++ b/hypervideo_dl/extractor/bongacams.py
@@ -49,7 +49,7 @@ class BongaCamsIE(InfoExtractor):

         return {
             'id': channel_id,
-            'title': self._live_title(uploader or uploader_id),
+            'title': uploader or uploader_id,
             'uploader': uploader,
             'uploader_id': uploader_id,
             'like_count': like_count,
diff --git a/hypervideo_dl/extractor/br.py b/hypervideo_dl/extractor/br.py
index 7169ece..0155827 100644
--- a/hypervideo_dl/extractor/br.py
+++ b/hypervideo_dl/extractor/br.py
@@ -175,7 +175,7 @@ class BRIE(InfoExtractor):

 class BRMediathekIE(InfoExtractor):
     IE_DESC = 'Bayerischer Rundfunk Mediathek'
-    _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek/video/[^/?&#]*?-(?P<id>av:[0-9a-f]{24})'
+    _VALID_URL = r'https?://(?:www\.)?br\.de/mediathek//?video/(?:[^/?&#]+?-)?(?P<id>av:[0-9a-f]{24})'

     _TESTS = [{
         'url': 'https://www.br.de/mediathek/video/gesundheit-die-sendung-vom-28112017-av:5a1e6a6e8fce6d001871cc8e',
@@ -188,6 +188,9 @@ class BRMediathekIE(InfoExtractor):
             'timestamp': 1511942766,
             'upload_date': '20171129',
         }
+    }, {
+        'url': 'https://www.br.de/mediathek//video/av:61b0db581aed360007558c12',
+        'only_matching': True,
     }]

     def _real_extract(self, url):
diff --git a/hypervideo_dl/extractor/breitbart.py b/hypervideo_dl/extractor/breitbart.py
new file mode 100644
index 0000000..e029aa6
--- /dev/null
+++ b/hypervideo_dl/extractor/breitbart.py
@@ -0,0 +1,38 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class BreitBartIE(InfoExtractor):
+    _VALID_URL = r'https?:\/\/(?:www\.)breitbart.com/videos/v/(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'https://www.breitbart.com/videos/v/5cOz1yup/?pl=Ij6NDOji',
+        'md5': '0aa6d1d6e183ac5ca09207fe49f17ade',
+        'info_dict': {
+            'id': '5cOz1yup',
+            'ext': 'mp4',
+            'title': 'Watch \u2013 Clyburn: Statues in Congress Have to Go Because they Are Honoring Slavery',
+            'description': 'md5:bac35eb0256d1cb17f517f54c79404d5',
+            'thumbnail': 'https://cdn.jwplayer.com/thumbs/5cOz1yup-1920.jpg',
+            'age_limit': 0,
+        }
+    }, {
+        'url': 'https://www.breitbart.com/videos/v/eaiZjVOn/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        formats = self._extract_m3u8_formats(f'https://cdn.jwplayer.com/manifests/{video_id}.m3u8', video_id, ext='mp4')
+        self._sort_formats(formats)
+        return {
+            'id': video_id,
+            'title': (self._og_search_title(webpage, default=None)
+                      or self._html_extract_title(webpage, 'video title')),
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'age_limit': self._rta_search(webpage),
+            'formats': formats
+        }
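Breitbart hosts its media on JW Player's CDN, so the extractor can go straight from the media id to the HLS manifest without touching a player API. The URL shapes it relies on (taken from the code and test above; assumed stable, not documented):

    video_id = '5cOz1yup'
    manifest = f'https://cdn.jwplayer.com/manifests/{video_id}.m3u8'
    thumbnail = f'https://cdn.jwplayer.com/thumbs/{video_id}-1920.jpg'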
clean_html,
+    dict_get,
     extract_attributes,
     ExtractorError,
     find_xpath_attr,
@@ -471,32 +472,22 @@ class BrightcoveNewIE(AdobePassIE):
     def _parse_brightcove_metadata(self, json_data, video_id, headers={}):
         title = json_data['name'].strip()

-        num_drm_sources = 0
         formats, subtitles = [], {}
         sources = json_data.get('sources') or []
         for source in sources:
             container = source.get('container')
             ext = mimetype2ext(source.get('type'))
             src = source.get('src')
-            skip_unplayable = not self.get_param('allow_unplayable_formats')
-            # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object
-            if skip_unplayable and (container == 'WVM' or source.get('key_systems')):
-                num_drm_sources += 1
-                continue
-            elif ext == 'ism' and skip_unplayable:
-                continue
-            elif ext == 'm3u8' or container == 'M2TS':
+            if ext == 'm3u8' or container == 'M2TS':
                 if not src:
                     continue
-                f, subs = self._extract_m3u8_formats_and_subtitles(
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(
                     src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
-                formats.extend(f)
                 subtitles = self._merge_subtitles(subtitles, subs)
             elif ext == 'mpd':
                 if not src:
                     continue
-                f, subs = self._extract_mpd_formats_and_subtitles(src, video_id, 'dash', fatal=False)
-                formats.extend(f)
+                fmts, subs = self._extract_mpd_formats_and_subtitles(src, video_id, 'dash', fatal=False)
                 subtitles = self._merge_subtitles(subtitles, subs)
             else:
                 streaming_src = source.get('streaming_src')
@@ -543,7 +534,13 @@ class BrightcoveNewIE(AdobePassIE):
                         'play_path': stream_name,
                         'format_id': build_format_id('rtmp'),
                     })
-                formats.append(f)
+                fmts = [f]
+
+            # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object
+            if container == 'WVM' or source.get('key_systems') or ext == 'ism':
+                for f in fmts:
+                    f['has_drm'] = True
+            formats.extend(fmts)

         if not formats:
             errors = json_data.get('errors')
@@ -551,9 +548,6 @@ class BrightcoveNewIE(AdobePassIE):
             error = errors[0]
             self.raise_no_formats(
                 error.get('message') or error.get('error_subcode') or error['error_code'], expected=True)
-        elif (not self.get_param('allow_unplayable_formats')
-                and sources and num_drm_sources == len(sources)):
-            self.report_drm(video_id)

         self._sort_formats(formats)

@@ -577,11 +571,19 @@ class BrightcoveNewIE(AdobePassIE):
         if duration is not None and duration <= 0:
             is_live = True

+        common_res = [(160, 90), (320, 180), (480, 720), (640, 360), (768, 432), (1024, 576), (1280, 720), (1366, 768), (1920, 1080)]
+        thumb_base_url = dict_get(json_data, ('poster', 'thumbnail'))
+        thumbnails = [{
+            'url': re.sub(r'\d+x\d+', f'{w}x{h}', thumb_base_url),
+            'width': w,
+            'height': h,
+        } for w, h in common_res] if thumb_base_url else None
+
         return {
             'id': video_id,
-            'title': self._live_title(title) if is_live else title,
+            'title': title,
             'description': clean_html(json_data.get('description')),
-            'thumbnail': json_data.get('thumbnail') or json_data.get('poster'),
+            'thumbnails': thumbnails,
             'duration': duration,
             'timestamp': parse_iso8601(json_data.get('published_at')),
             'uploader_id': json_data.get('account_id'),
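Instead of a single poster URL, the Brightcove metadata parser now fans one base thumbnail out over a list of common resolutions by rewriting the WxH token embedded in the URL. The rewrite step in isolation, on an illustrative base URL:

    import re

    base = 'https://cf-images.example.brightcove.com/image/160x90/match/poster.jpg'
    print([re.sub(r'\d+x\d+', f'{w}x{h}', base) for w, h in [(320, 180), (1280, 720)]])
    # ['.../image/320x180/match/poster.jpg', '.../image/1280x720/match/poster.jpg']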
'info_dict': {
+            'id': 'lS4iR9lWjN8',
+            'ext': 'mp4',
+            'title': '國產麻豆AV 叮叮映畫 DDF001 情欲小說家 - CableAV',
+            'description': '國產AV 480p, 720p 国产麻豆AV 叮叮映画 DDF001 情欲小说家',
+            'thumbnail': r're:^https?://.*\.jpg$',
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        video_url = self._og_search_video_url(webpage, secure=False)
+
+        formats = self._extract_m3u8_formats(video_url, video_id, 'mp4')
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'formats': formats,
+        }
diff --git a/hypervideo_dl/extractor/callin.py b/hypervideo_dl/extractor/callin.py
new file mode 100644
index 0000000..1f3b7cf
--- /dev/null
+++ b/hypervideo_dl/extractor/callin.py
@@ -0,0 +1,114 @@
+# coding: utf-8
+from .common import InfoExtractor
+from ..utils import (
+    traverse_obj,
+    float_or_none,
+    int_or_none
+)
+
+
+class CallinIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?callin\.com/(episode)/(?P<id>[-a-zA-Z]+)'
+    _TESTS = [{
+        'url': 'https://www.callin.com/episode/the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc',
+        'info_dict': {
+            'id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd',
+            'title': 'The Title IX Regime and the Long March Through and Beyond the Institutions',
+            'ext': 'ts',
+            'display_id': 'the-title-ix-regime-and-the-long-march-through-EBfXYSrsjc',
+            'thumbnail': 're:https://.+\\.png',
+            'description': 'First episode',
+            'uploader': 'Wesley Yang',
+            'timestamp': 1639404128.65,
+            'upload_date': '20211213',
+            'uploader_id': 'wesyang',
+            'uploader_url': 'http://wesleyyang.substack.com',
+            'channel': 'Conversations in Year Zero',
+            'channel_id': '436d1f82ddeb30cd2306ea9156044d8d2cfdc3f1f1552d245117a42173e78553',
+            'channel_url': 'https://callin.com/show/conversations-in-year-zero-oJNllRFSfx',
+            'duration': 9951.936,
+            'view_count': int,
+            'categories': ['News & Politics', 'History', 'Technology'],
+            'cast': ['Wesley Yang', 'KC Johnson', 'Gabi Abramovich'],
+            'series': 'Conversations in Year Zero',
+            'series_id': '436d1f82ddeb30cd2306ea9156044d8d2cfdc3f1f1552d245117a42173e78553',
+            'episode': 'The Title IX Regime and the Long March Through and Beyond the Institutions',
+            'episode_number': 1,
+            'episode_id': '218b979630a35ead12c6fd096f2996c56c37e4d0dc1f6dc0feada32dcf7b31cd'
+        }
+    }]
+
+    def try_get_user_name(self, d):
+        names = [d.get(n) for n in ('first', 'last')]
+        if None in names:
+            return next((n for n in names if n), default=None)
+        return ' '.join(names)
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        next_data = self._search_nextjs_data(webpage, display_id)
+        episode = next_data['props']['pageProps']['episode']
+
+        id = episode['id']
+        title = (episode.get('title')
+                 or self._og_search_title(webpage, fatal=False)
+                 or self._html_extract_title(webpage))
+        url = episode['m3u8']
+        formats = self._extract_m3u8_formats(url, display_id, ext='ts')
+        self._sort_formats(formats)
+
+        show = traverse_obj(episode, ('show', 'title'))
+        show_id = traverse_obj(episode, ('show', 'id'))
+
+        show_json = None
+        app_slug = (self._html_search_regex(
+            r'<script\s+src=["\']/_next/static/([-_a-zA-Z0-9]+)/_',
+            webpage, 'app slug', fatal=False))
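Callin pages are Next.js apps, so the episode object comes from the page's `__NEXT_DATA__` JSON blob rather than scraped HTML; `_search_nextjs_data` wraps exactly that lookup. The underlying idea, simplified and illustrative:

    import json
    import re

    html = ('<script id="__NEXT_DATA__" type="application/json">'
            '{"props":{"pageProps":{"episode":{"id":"abc"}}}}</script>')
    data = json.loads(re.search(r'<script id="__NEXT_DATA__"[^>]*>(.+?)</script>', html).group(1))
    episode = data['props']['pageProps']['episode']  # {'id': 'abc'}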
'thumbnail': 'https://cwwp2.dot.ca.gov/data/d3/cctv/image/hwy50at24th/hwy50at24th.jpg', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + global_vars = self._search_regex( + r'<script[^>]*>(var[\s\S]+?)</script>', + webpage, 'Global Vars') + route_place = self._search_regex(r'routePlace\s*=\s*"([^"]+)"', global_vars, 'Route Place', fatal=False) + location_name = self._search_regex(r'locationName\s*=\s*"([^"]+)"', global_vars, 'Location Name', fatal=False) + poster_url = self._search_regex(r'posterURL\s*=\s*"([^"]+)"', global_vars, 'Poster Url', fatal=False) + video_stream = self._search_regex(r'videoStreamURL\s*=\s*"([^"]+)"', global_vars, 'Video Stream URL', fatal=False) + + formats = self._extract_m3u8_formats(video_stream, video_id, 'ts', live=True) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': f'{route_place} : {location_name}', + 'is_live': True, + 'formats': formats, + 'thumbnail': poster_url, + } diff --git a/hypervideo_dl/extractor/cam4.py b/hypervideo_dl/extractor/cam4.py index 30daf2b..2a3931f 100644 --- a/hypervideo_dl/extractor/cam4.py +++ b/hypervideo_dl/extractor/cam4.py @@ -13,6 +13,8 @@ class CAM4IE(InfoExtractor): 'ext': 'mp4', 'title': 're:^foxynesss [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'age_limit': 18, + 'live_status': 'is_live', + 'thumbnail': 'https://snapshots.xcdnpro.com/thumbnails/foxynesss', } } @@ -25,8 +27,9 @@ class CAM4IE(InfoExtractor): return { 'id': channel_id, - 'title': self._live_title(channel_id), + 'title': channel_id, 'is_live': True, 'age_limit': 18, 'formats': formats, + 'thumbnail': f'https://snapshots.xcdnpro.com/thumbnails/{channel_id}', } diff --git a/hypervideo_dl/extractor/cammodels.py b/hypervideo_dl/extractor/cammodels.py index eb2a8b4..3dc1937 100644 --- a/hypervideo_dl/extractor/cammodels.py +++ b/hypervideo_dl/extractor/cammodels.py @@ -91,7 +91,7 @@ class CamModelsIE(InfoExtractor): return { 'id': user_id, - 'title': self._live_title(user_id), + 'title': user_id, 'is_live': True, 'formats': formats, 'age_limit': 18 diff --git a/hypervideo_dl/extractor/canalalpha.py b/hypervideo_dl/extractor/canalalpha.py new file mode 100644 index 0000000..0365cb2 --- /dev/null +++ b/hypervideo_dl/extractor/canalalpha.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + dict_get, + try_get, + unified_strdate, +) + + +class CanalAlphaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?canalalpha\.ch/play/[^/]+/[^/]+/(?P<id>\d+)/?.*' + + _TESTS = [{ + 'url': 'https://www.canalalpha.ch/play/le-journal/episode/24520/jeudi-28-octobre-2021', + 'info_dict': { + 'id': '24520', + 'ext': 'mp4', + 'title': 'Jeudi 28 octobre 2021', + 'description': 'md5:d30c6c3e53f8ad40d405379601973b30', + 'thumbnail': 'https://static.canalalpha.ch/poster/journal/journal_20211028.jpg', + 'upload_date': '20211028', + 'duration': 1125, + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.canalalpha.ch/play/le-journal/topic/24512/la-poste-fait-de-neuchatel-un-pole-cryptographique', + 'info_dict': { + 'id': '24512', + 'ext': 'mp4', + 'title': 'La Poste fait de Neuchâtel un pôle cryptographique', + 'description': 'md5:4ba63ae78a0974d1a53d6703b6e1dedf', + 'thumbnail': 'https://static.canalalpha.ch/poster/news/news_39712.jpg', + 'upload_date': '20211028', + 'duration': 138, + }, + 'params': {'skip_download': True} + }, { + 'url':
'https://www.canalalpha.ch/play/eureka/episode/24484/ces-innovations-qui-veulent-rendre-lagriculture-plus-durable', + 'info_dict': { + 'id': '24484', + 'ext': 'mp4', + 'title': 'Ces innovations qui veulent rendre l’agriculture plus durable', + 'description': 'md5:3de3f151180684621e85be7c10e4e613', + 'thumbnail': 'https://static.canalalpha.ch/poster/magazine/magazine_10236.jpg', + 'upload_date': '20211026', + 'duration': 360, + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.canalalpha.ch/play/avec-le-temps/episode/23516/redonner-de-leclat-grace-au-polissage', + 'info_dict': { + 'id': '23516', + 'ext': 'mp4', + 'title': 'Redonner de l\'éclat grâce au polissage', + 'description': 'md5:0d8fbcda1a5a4d6f6daa3165402177e1', + 'thumbnail': 'https://static.canalalpha.ch/poster/magazine/magazine_9990.png', + 'upload_date': '20210726', + 'duration': 360, + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + data_json = self._parse_json(self._search_regex( + r'window\.__SERVER_STATE__\s?=\s?({(?:(?!};)[^"]|"([^"]|\\")*")+})\s?;', + webpage, 'data_json'), id)['1']['data']['data'] + manifests = try_get(data_json, lambda x: x['video']['manifests'], expected_type=dict) or {} + subtitles = {} + formats = [{ + 'url': video['$url'], + 'ext': 'mp4', + 'width': try_get(video, lambda x: x['res']['width'], expected_type=int), + 'height': try_get(video, lambda x: x['res']['height'], expected_type=int), + } for video in try_get(data_json, lambda x: x['video']['mp4'], expected_type=list) or [] if video.get('$url')] + if manifests.get('hls'): + m3u8_frmts, m3u8_subs = self._parse_m3u8_formats_and_subtitles(manifests['hls'], video_id=id) + formats.extend(m3u8_frmts) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) + if manifests.get('dash'): + dash_frmts, dash_subs = self._parse_mpd_formats_and_subtitles(manifests['dash']) + formats.extend(dash_frmts) + subtitles = self._merge_subtitles(subtitles, dash_subs) + self._sort_formats(formats) + return { + 'id': id, + 'title': data_json.get('title').strip(), + 'description': clean_html(dict_get(data_json, ('longDesc', 'shortDesc'))), + 'thumbnail': data_json.get('poster'), + 'upload_date': unified_strdate(dict_get(data_json, ('webPublishAt', 'featuredAt', 'diffusionDate'))), + 'duration': try_get(data_json, lambda x: x['video']['duration'], expected_type=int), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/hypervideo_dl/extractor/canvas.py b/hypervideo_dl/extractor/canvas.py index 49e7e4e..8b99037 100644 --- a/hypervideo_dl/extractor/canvas.py +++ b/hypervideo_dl/extractor/canvas.py @@ -1,4 +1,5 @@ from __future__ import unicode_literals +import json from .common import InfoExtractor @@ -41,9 +42,9 @@ class CanvasIE(InfoExtractor): _GEO_BYPASS = False _HLS_ENTRY_PROTOCOLS_MAP = { 'HLS': 'm3u8_native', - 'HLS_AES': 'm3u8', + 'HLS_AES': 'm3u8_native', } - _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1' + _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2' def _real_extract(self, url): mobj = self._match_valid_url(url) @@ -59,18 +60,23 @@ class CanvasIE(InfoExtractor): # New API endpoint if not data: + vrtnutoken = self._download_json('https://token.vrt.be/refreshtoken', + video_id, note='refreshtoken: Retrieve vrtnutoken', + errnote='refreshtoken failed')['vrtnutoken'] headers = self.geo_verification_headers() - 
headers.update({'Content-Type': 'application/json'}) - token = self._download_json( + headers.update({'Content-Type': 'application/json; charset=utf-8'}) + vrtPlayerToken = self._download_json( '%s/tokens' % self._REST_API_BASE, video_id, - 'Downloading token', data=b'', headers=headers)['vrtPlayerToken'] + 'Downloading token', headers=headers, data=json.dumps({ + 'identityToken': vrtnutoken + }).encode('utf-8'))['vrtPlayerToken'] data = self._download_json( '%s/videos/%s' % (self._REST_API_BASE, video_id), video_id, 'Downloading video JSON', query={ - 'vrtPlayerToken': token, - 'client': '%s@PROD' % site_id, + 'vrtPlayerToken': vrtPlayerToken, + 'client': 'null', }, expected_status=400) - if not data.get('title'): + if 'title' not in data: code = data.get('code') if code == 'AUTHENTICATION_REQUIRED': self.raise_login_required() @@ -78,7 +84,8 @@ class CanvasIE(InfoExtractor): self.raise_geo_restricted(countries=['BE']) raise ExtractorError(data.get('message') or code, expected=True) - title = data['title'] + # Note: The title may be an empty string + title = data['title'] or f'{site_id} {video_id}' description = data.get('description') formats = [] @@ -238,10 +245,6 @@ class VrtNUIE(GigyaBaseIE): 'upload_date': '20200727', }, 'skip': 'This video is only available for registered users', - 'params': { - 'username': '', - 'password': '', - }, 'expected_warnings': ['is not a supported codec'], }, { # Only available via new API endpoint @@ -257,34 +260,20 @@ class VrtNUIE(GigyaBaseIE): 'episode_number': 5, }, 'skip': 'This video is only available for registered users', - 'params': { - 'username': '', - 'password': '', - }, 'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'], }] _NETRC_MACHINE = 'vrtnu' - _APIKEY = '3_qhEcPa5JGFROVwu5SWKqJ4mVOIkwlFNMSKwzPDAh8QZOtHqu6L4nD5Q7lk0eXOOG' + _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' _CONTEXT_ID = 'R3595707040' - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - auth_info = self._download_json( - 'https://accounts.vrt.be/accounts.login', None, - note='Login data', errnote='Could not get Login data', - headers={}, data=urlencode_postdata({ - 'loginID': username, - 'password': password, - 'sessionExpiration': '-2', - 'APIKey': self._APIKEY, - 'targetEnv': 'jssdk', - })) + def _perform_login(self, username, password): + auth_info = self._gigya_login({ + 'APIKey': self._APIKEY, + 'targetEnv': 'jssdk', + 'loginID': username, + 'password': password, + 'authMode': 'cookie', + }) if auth_info.get('errorDetails'): raise ExtractorError('Unable to login: VrtNU said: ' + auth_info.get('errorDetails'), expected=True) @@ -301,14 +290,15 @@ class VrtNUIE(GigyaBaseIE): 'UID': auth_info['UID'], 'UIDSignature': auth_info['UIDSignature'], 'signatureTimestamp': auth_info['signatureTimestamp'], - 'client_id': 'vrtnu-site', '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, } self._request_webpage( 'https://login.vrt.be/perform_login', - None, note='Requesting a token', errnote='Could not get a token', - headers={}, data=urlencode_postdata(post_data)) + None, note='Performing login', errnote='perform login failed', + headers={}, query={ + 'client_id': 'vrtnu-site' + }, data=urlencode_postdata(post_data)) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: diff --git a/hypervideo_dl/extractor/carambatv.py 
b/hypervideo_dl/extractor/carambatv.py index b57b86a..7e5cc90 100644 --- a/hypervideo_dl/extractor/carambatv.py +++ b/hypervideo_dl/extractor/carambatv.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + format_field, float_or_none, int_or_none, try_get, @@ -43,7 +44,7 @@ class CarambaTVIE(InfoExtractor): formats = [{ 'url': base_url + f['fn'], 'height': int_or_none(f.get('height')), - 'format_id': '%sp' % f['height'] if f.get('height') else None, + 'format_id': format_field(f, 'height', '%sp'), } for f in video['qualities'] if f.get('fn')] self._sort_formats(formats) diff --git a/hypervideo_dl/extractor/cbc.py b/hypervideo_dl/extractor/cbc.py index 2429521..4892419 100644 --- a/hypervideo_dl/extractor/cbc.py +++ b/hypervideo_dl/extractor/cbc.py @@ -2,17 +2,22 @@ from __future__ import unicode_literals import re +import json +import base64 +import time from .common import InfoExtractor from ..compat import ( compat_str, ) from ..utils import ( + int_or_none, + join_nonempty, js_to_json, - smuggle_url, - try_get, orderedSet, + smuggle_url, strip_or_none, + try_get, ExtractorError, ) @@ -122,9 +127,9 @@ class CBCIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - title = self._og_search_title(webpage, default=None) or self._html_search_meta( - 'twitter:title', webpage, 'title', default=None) or self._html_search_regex( - r'([^<]+)', webpage, 'title', fatal=False) + title = (self._og_search_title(webpage, default=None) + or self._html_search_meta('twitter:title', webpage, 'title', default=None) + or self._html_extract_title(webpage)) entries = [ self._extract_player_init(player_init, display_id) for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] @@ -244,37 +249,129 @@ class CBCGemIE(InfoExtractor): 'params': {'format': 'bv'}, 'skip': 'Geo-restricted to Canada', }] - _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + + _GEO_COUNTRIES = ['CA'] + _TOKEN_API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37' + _NETRC_MACHINE = 'cbcgem' + _claims_token = None + + def _new_claims_token(self, email, password): + data = json.dumps({ + 'email': email, + 'password': password, + }).encode() + headers = {'content-type': 'application/json'} + query = {'apikey': self._TOKEN_API_KEY} + resp = self._download_json('https://api.loginradius.com/identity/v2/auth/login', + None, data=data, headers=headers, query=query) + access_token = resp['access_token'] + + query = { + 'access_token': access_token, + 'apikey': self._TOKEN_API_KEY, + 'jwtapp': 'jwt', + } + resp = self._download_json('https://cloud-api.loginradius.com/sso/jwt/api/token', + None, headers=headers, query=query) + sig = resp['signature'] + + data = json.dumps({'jwt': sig}).encode() + headers = {'content-type': 'application/json', 'ott-device-type': 'web'} + resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/token', + None, data=data, headers=headers) + cbc_access_token = resp['accessToken'] + + headers = {'content-type': 'application/json', 'ott-device-type': 'web', 'ott-access-token': cbc_access_token} + resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/profile', + None, headers=headers) + return resp['claimsToken'] + + def _get_claims_token_expiry(self): + # Token is a JWT + # JWT is decoded here and 'exp' field is extracted + # It is a Unix timestamp for when the token 
expires + b64_data = self._claims_token.split('.')[1] + data = base64.urlsafe_b64decode(b64_data + "==") + return json.loads(data)['exp'] + + def claims_token_expired(self): + exp = self._get_claims_token_expiry() + if exp - time.time() < 10: + # It will expire in less than 10 seconds, or has already expired + return True + return False + + def claims_token_valid(self): + return self._claims_token is not None and not self.claims_token_expired() + + def _get_claims_token(self, email, password): + if not self.claims_token_valid(): + self._claims_token = self._new_claims_token(email, password) + self._downloader.cache.store(self._NETRC_MACHINE, 'claims_token', self._claims_token) + return self._claims_token + + def _real_initialize(self): + if self.claims_token_valid(): + return + self._claims_token = self._downloader.cache.load(self._NETRC_MACHINE, 'claims_token') + + def _find_secret_formats(self, formats, video_id): + """ Find a valid video url and convert it to the secret variant """ + base_format = next((f for f in formats if f.get('vcodec') != 'none'), None) + if not base_format: + return + + base_url = re.sub(r'(Manifest\(.*?),filter=[\w-]+(.*?\))', r'\1\2', base_format['url']) + url = re.sub(r'(Manifest\(.*?),format=[\w-]+(.*?\))', r'\1\2', base_url) + + secret_xml = self._download_xml(url, video_id, note='Downloading secret XML', fatal=False) + if not secret_xml: + return + + for child in secret_xml: + if child.attrib.get('Type') != 'video': + continue + for video_quality in child: + bitrate = int_or_none(video_quality.attrib.get('Bitrate')) + if not bitrate or 'Index' not in video_quality.attrib: + continue + height = int_or_none(video_quality.attrib.get('MaxHeight')) + + yield { + **base_format, + 'format_id': join_nonempty('sec', height), + # Note: \g<1> is necessary instead of \1 since bitrate is a number + 'url': re.sub(r'(QualityLevels\()\d+(\))', fr'\g<1>{bitrate}\2', base_url), + 'width': int_or_none(video_quality.attrib.get('MaxWidth')), + 'tbr': bitrate / 1000.0, + 'height': height, + } def _real_extract(self, url): video_id = self._match_id(url) - video_info = self._download_json(self._API_BASE + video_id, video_id) - - last_error = None - attempt = -1 - retries = self.get_param('extractor_retries', 15) - while attempt < retries: - attempt += 1 - if last_error: - self.report_warning('%s. Retrying ...' 
% last_error) - m3u8_info = self._download_json( - video_info['playSession']['url'], video_id, - note='Downloading JSON metadata%s' % f' (attempt {attempt})') - m3u8_url = m3u8_info.get('url') - if m3u8_url: - break - elif m3u8_info.get('errorCode') == 1: - self.raise_geo_restricted(countries=['CA']) - else: - last_error = f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}' - # 35 means media unavailable, but retries work - if m3u8_info.get('errorCode') != 35 or attempt >= retries: - raise ExtractorError(last_error) + video_info = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + video_id, video_id) + + email, password = self._get_login_info() + if email and password: + claims_token = self._get_claims_token(email, password) + headers = {'x-claims-token': claims_token} + else: + headers = {} + m3u8_info = self._download_json(video_info['playSession']['url'], video_id, headers=headers) + m3u8_url = m3u8_info.get('url') + + if m3u8_info.get('errorCode') == 1: + self.raise_geo_restricted(countries=['CA']) + elif m3u8_info.get('errorCode') == 35: + self.raise_login_required(method='password') + elif m3u8_info.get('errorCode') != 0: + raise ExtractorError(f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}') formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls') self._remove_duplicate_formats(formats) + formats.extend(self._find_secret_formats(formats, video_id)) - for i, format in enumerate(formats): + for format in formats: if format.get('vcodec') == 'none': if format.get('ext') is None: format['ext'] = 'm4a' @@ -328,7 +425,8 @@ class CBCGemPlaylistIE(InfoExtractor): show = match.group('show') show_info = self._download_json(self._API_BASE + show, season_id) season = int(match.group('season')) - season_info = try_get(show_info, lambda x: x['seasons'][season - 1]) + + season_info = next((s for s in show_info['seasons'] if s.get('season') == season), None) if season_info is None: raise ExtractorError(f'Couldn\'t find season {season} of {show}') @@ -377,7 +475,7 @@ class CBCGemLiveIE(InfoExtractor): IE_NAME = 'gem.cbc.ca:live' - _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>[0-9]{12})' + _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>\d+)' _TEST = { 'url': 'https://gem.cbc.ca/live/920604739687', 'info_dict': { @@ -396,21 +494,21 @@ # It's unclear where the chars at the end come from, but they appear to be # constant. Might need updating in the future. - _API = 'https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT' + # There are two URLs, some livestreams are in one, and some + # in the other. The JSON schema is the same for both.
+ _API_URLS = ['https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT', 'https://tpfeed.cbc.ca/f/ExhSPC/FNiv9xQx_BnT'] def _real_extract(self, url): video_id = self._match_id(url) - live_info = self._download_json(self._API, video_id)['entries'] - video_info = None - for stream in live_info: - if stream.get('guid') == video_id: - video_info = stream - - if video_info is None: - raise ExtractorError( - 'Couldn\'t find video metadata, maybe this livestream is now offline', - expected=True) + for api_url in self._API_URLS: + video_info = next(( + stream for stream in self._download_json(api_url, video_id)['entries'] + if stream.get('guid') == video_id), None) + if video_info: + break + else: + raise ExtractorError('Couldn\'t find video metadata, maybe this livestream is now offline', expected=True) return { '_type': 'url_transparent', diff --git a/hypervideo_dl/extractor/cbs.py b/hypervideo_dl/extractor/cbs.py index ae9ce58..2af36ea 100644 --- a/hypervideo_dl/extractor/cbs.py +++ b/hypervideo_dl/extractor/cbs.py @@ -77,21 +77,21 @@ class CBSIE(CBSBaseIE): (?: cbs:| https?://(?:www\.)?(?: - cbs\.com/(?:shows/[^/]+/video|movies/[^/]+)/| + cbs\.com/(?:shows|movies)/(?:video|[^/]+/video|[^/]+)/| colbertlateshow\.com/(?:video|podcasts)/) )(?P<id>[\w-]+)''' # All tests are blocked outside US _TESTS = [{ - 'url': 'https://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', + 'url': 'https://www.cbs.com/shows/video/xrUyNLtl9wd8D_RWWAg9NU2F_V6QpB3R/', 'info_dict': { - 'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_', + 'id': 'xrUyNLtl9wd8D_RWWAg9NU2F_V6QpB3R', 'ext': 'mp4', - 'title': 'Connect Chat feat. Garth Brooks', - 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013.
Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', - 'duration': 1495, - 'timestamp': 1385585425, - 'upload_date': '20131127', + 'title': 'Tough As Nails - Dreams Never Die', + 'description': 'md5:a3535a62531cdd52b0364248a2c1ae33', + 'duration': 2588, + 'timestamp': 1639015200, + 'upload_date': '20211209', 'uploader': 'CBSI-NEW', }, 'params': { @@ -99,14 +99,14 @@ class CBSIE(CBSBaseIE): 'skip_download': True, }, }, { - 'url': 'https://www.cbs.com/shows/the-late-show-with-stephen-colbert/video/60icOhMb9NcjbcWnF_gub9XXHdeBcNk2/the-late-show-6-23-21-christine-baranski-joy-oladokun-', + 'url': 'https://www.cbs.com/shows/video/sZH1MGgomIosZgxGJ1l263MFq16oMtW1/', 'info_dict': { - 'id': '60icOhMb9NcjbcWnF_gub9XXHdeBcNk2', - 'title': 'The Late Show - 6/23/21 (Christine Baranski, Joy Oladokun)', - 'timestamp': 1624507140, - 'description': 'md5:e01af24e95c74d55e8775aef86117b95', + 'id': 'sZH1MGgomIosZgxGJ1l263MFq16oMtW1', + 'title': 'The Late Show - 3/16/22 (Michael Buble, Rose Matafeo)', + 'timestamp': 1647488100, + 'description': 'md5:d0e6ec23c544b7fa8e39a8e6844d2439', 'uploader': 'CBSI-NEW', - 'upload_date': '20210624', + 'upload_date': '20220317', }, 'params': { 'ignore_no_formats_error': True, diff --git a/hypervideo_dl/extractor/ccma.py b/hypervideo_dl/extractor/ccma.py index ea98f86..9dbaabf 100644 --- a/hypervideo_dl/extractor/ccma.py +++ b/hypervideo_dl/extractor/ccma.py @@ -1,17 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -import calendar -import datetime - from .common import InfoExtractor from ..utils import ( clean_html, - extract_timezone, int_or_none, parse_duration, parse_resolution, try_get, + unified_timestamp, url_or_none, ) @@ -95,14 +92,8 @@ class CCMAIE(InfoExtractor): duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text')) tematica = try_get(informacio, lambda x: x['tematica']['text']) - timestamp = None data_utc = try_get(informacio, lambda x: x['data_emissio']['utc']) - try: - timezone, data_utc = extract_timezone(data_utc) - timestamp = calendar.timegm((datetime.datetime.strptime( - data_utc, '%Y-%d-%mT%H:%M:%S') - timezone).timetuple()) - except TypeError: - pass + timestamp = unified_timestamp(data_utc) subtitles = {} subtitols = media.get('subtitols') or [] diff --git a/hypervideo_dl/extractor/cctv.py b/hypervideo_dl/extractor/cctv.py index 9b86121..0ed5f32 100644 --- a/hypervideo_dl/extractor/cctv.py +++ b/hypervideo_dl/extractor/cctv.py @@ -162,7 +162,8 @@ class CCTVIE(InfoExtractor): 'url': video_url, 'format_id': 'http', 'quality': quality, - 'source_preference': -10 + # Sample clip + 'preference': -10 }) hls_url = try_get(data, lambda x: x['hls_url'], compat_str) diff --git a/hypervideo_dl/extractor/ceskatelevize.py b/hypervideo_dl/extractor/ceskatelevize.py index 5e04d38..ddf66b2 100644 --- a/hypervideo_dl/extractor/ceskatelevize.py +++ b/hypervideo_dl/extractor/ceskatelevize.py @@ -12,30 +12,15 @@ from ..utils import ( ExtractorError, float_or_none, sanitized_Request, - unescapeHTML, - update_url_query, + traverse_obj, urlencode_postdata, USER_AGENTS, ) class CeskaTelevizeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' + _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady)/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' _TESTS = [{ - 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', - 'info_dict': { - 'id': '61924494877246241', - 'ext': 'mp4', -
'title': 'Hyde Park Civilizace: Život v Grónsku', - 'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 3350, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { 'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en', 'info_dict': { 'id': '61924494877028507', @@ -66,12 +51,60 @@ class CeskaTelevizeIE(InfoExtractor): }, { 'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25', 'only_matching': True, + }, { + # video with 18+ caution trailer + 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', + 'info_dict': { + 'id': '215562210900007-bogotart', + 'title': 'Queer: Bogotart', + 'description': 'Hlavní město Kolumbie v doprovodu queer umělců. Vroucí svět plný vášně, sebevědomí, ale i násilí a bolesti. Připravil Peter Serge Butko', + }, + 'playlist': [{ + 'info_dict': { + 'id': '61924494877311053', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Varování 18+)', + 'duration': 11.9, + }, + }, { + 'info_dict': { + 'id': '61924494877068022', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Queer)', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 1558.3, + }, + }], + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # iframe embed + 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', + 'only_matching': True, }] def _real_extract(self, url): playlist_id = self._match_id(url) - + parsed_url = compat_urllib_parse_urlparse(url) webpage = self._download_webpage(url, playlist_id) + site_name = self._og_search_property('site_name', webpage, fatal=False, default=None) + playlist_title = self._og_search_title(webpage, default=None) + if site_name and playlist_title: + playlist_title = playlist_title.replace(f' — {site_name}', '', 1) + playlist_description = self._og_search_description(webpage, default=None) + if playlist_description: + playlist_description = playlist_description.replace('\xa0', ' ') + + if parsed_url.path.startswith('/porady/'): + next_data = self._search_nextjs_data(webpage, playlist_id) + idec = traverse_obj(next_data, ('props', 'pageProps', 'data', ('show', 'mediaMeta'), 'idec'), get_all=False) + if not idec: + raise ExtractorError('Failed to find IDEC id') + iframe_hash = self._download_webpage('https://www.ceskatelevize.cz/v-api/iframe-hash/', playlist_id) + webpage = self._download_webpage('https://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php', playlist_id, + query={'hash': iframe_hash, 'origin': 'iVysilani', 'autoStart': 'true', 'IDEC': idec}) NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' if '%s</p>' % NOT_AVAILABLE_STRING in webpage: @@ -100,7 +133,7 @@ class CeskaTelevizeIE(InfoExtractor): data = { 'playlist[0][type]': type_, 'playlist[0][id]': episode_id, - 'requestUrl': compat_urllib_parse_urlparse(url).path, + 'requestUrl': parsed_url.path, 'requestSource': 'iVysilani', } @@ -108,7 +141,7 @@ for user_agent in (None, USER_AGENTS['Safari']): req = sanitized_Request( - 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', + 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/', data=urlencode_postdata(data)) req.add_header('Content-type', 'application/x-www-form-urlencoded') @@ -130,9 +163,6 @@ req = sanitized_Request(compat_urllib_parse_unquote(playlist_url)) req.add_header('Referer', url) - playlist_title = self._og_search_title(webpage, default=None) - playlist_description = self._og_search_description(webpage, default=None) - playlist = self._download_json(req, playlist_id, fatal=False) if not playlist: continue @@ -147,6 +177,7 @@ is_live = item.get('type') == 'LIVE' formats = [] for format_id, stream_url in item.get('streamUrls', {}).items(): + stream_url = stream_url.replace('https://', 'http://') if 'playerType=flash' in stream_url: stream_formats = self._extract_m3u8_formats( stream_url, playlist_id, 'mp4', 'm3u8_native', @@ -182,8 +213,6 @@ if playlist_len == 1: final_title = playlist_title or title - if is_live: - final_title = self._live_title(final_title) else: final_title = '%s (%s)' % (playlist_title, title) @@ -237,54 +266,3 @@ yield line return '\r\n'.join(_fix_subtitle(subtitles)) - - -class CeskaTelevizePoradyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' - _TESTS = [{ - # video with 18+ caution trailer - 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', - 'info_dict': { - 'id': '215562210900007-bogotart', - 'title': 'Queer: Bogotart', - 'description': 'Alternativní průvodce současným queer světem', - }, - 'playlist': [{ - 'info_dict': { - 'id': '61924494876844842', - 'ext': 'mp4', - 'title': 'Queer: Bogotart (Varování 18+)', - 'duration': 10.2, - }, - }, { - 'info_dict': { - 'id': '61924494877068022', - 'ext': 'mp4', - 'title': 'Queer: Bogotart (Queer)', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 1558.3, - }, - }], - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - # iframe embed - 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - data_url = update_url_query(unescapeHTML(self._search_regex( - (r'<iframe[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1', - r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'), - webpage, 'iframe player url', group='url')), query={ - 'autoStart': 'true', - }) - - return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key()) diff --git a/hypervideo_dl/extractor/chaturbate.py b/hypervideo_dl/extractor/chaturbate.py index a459dcb..8da51f9 100644 --- a/hypervideo_dl/extractor/chaturbate.py +++ b/hypervideo_dl/extractor/chaturbate.py @@ -101,7 +101,7 @@ class ChaturbateIE(InfoExtractor): return { 'id': video_id, - 'title': self._live_title(video_id), + 'title': video_id,
'thumbnail': 'https://roomimg.stream.highwebmedia.com/ri/%s.jpg' % video_id, 'age_limit': self._rta_search(webpage), 'is_live': True, diff --git a/hypervideo_dl/extractor/chingari.py b/hypervideo_dl/extractor/chingari.py index 6bdc4f6..e6841fb 100644 --- a/hypervideo_dl/extractor/chingari.py +++ b/hypervideo_dl/extractor/chingari.py @@ -67,7 +67,7 @@ class ChingariBaseIE(InfoExtractor): class ChingariIE(ChingariBaseIE): - _VALID_URL = r'(?:https?://)(?:www\.)?chingari\.io/share/post\?id=(?P<id>[^&/#?]+)' + _VALID_URL = r'https?://(?:www\.)?chingari\.io/share/post\?id=(?P<id>[^&/#?]+)' _TESTS = [{ 'url': 'https://chingari.io/share/post?id=612f8f4ce1dc57090e8a7beb', 'info_dict': { @@ -102,7 +102,7 @@ class ChingariIE(ChingariBaseIE): class ChingariUserIE(ChingariBaseIE): - _VALID_URL = r'(?:https?://)(?:www\.)?chingari\.io/(?!share/post)(?P<id>[^/?]+)' + _VALID_URL = r'https?://(?:www\.)?chingari\.io/(?!share/post)(?P<id>[^/?]+)' _TESTS = [{ 'url': 'https://chingari.io/dada1023', 'playlist_mincount': 3, diff --git a/hypervideo_dl/extractor/closertotruth.py b/hypervideo_dl/extractor/closertotruth.py index 26243d5..517e121 100644 --- a/hypervideo_dl/extractor/closertotruth.py +++ b/hypervideo_dl/extractor/closertotruth.py @@ -54,8 +54,7 @@ class CloserToTruthIE(InfoExtractor): r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)', webpage, 'kaltura partner_id') - title = self._search_regex( - r'<title>(.+?)\s*\|\s*.+?</title>', webpage, 'video title') + title = self._html_extract_title(webpage, 'video title') select = self._search_regex( r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>', diff --git a/hypervideo_dl/extractor/common.py b/hypervideo_dl/extractor/common.py index df74c75..0035191 100644 --- a/hypervideo_dl/extractor/common.py +++ b/hypervideo_dl/extractor/common.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals import base64 -import datetime +import collections import hashlib import itertools import json @@ -45,15 +45,18 @@ from ..utils import ( determine_ext, determine_protocol, dict_get, + encode_data_uri, error_to_compat_str, extract_attributes, ExtractorError, + filter_dict, fix_xml_ampersands, float_or_none, format_field, GeoRestrictedError, GeoUtils, int_or_none, + join_nonempty, js_to_json, JSON_LD_RE, mimetype2ext, @@ -73,7 +76,9 @@ from ..utils import ( str_to_int, strip_or_none, traverse_obj, + try_get, unescapeHTML, + UnsupportedError, unified_strdate, unified_timestamp, update_Request, @@ -134,6 +139,8 @@ class InfoExtractor(object): for HDS - URL of the F4M manifest, for DASH - URL of the MPD manifest, for MSS - URL of the ISM manifest. + * manifest_stream_number (For internal use only) + The index of the stream in the manifest file * ext Will be calculated from URL if missing * format A human-readable description of the format ("mp4 container with h264/opus"). @@ -161,9 +168,8 @@ class InfoExtractor(object): * filesize_approx An estimate for the number of bytes * player_url SWF Player URL (used for rtmpdump). * protocol The protocol that will be used for the actual - download, lower-case. - "http", "https", "rtsp", "rtmp", "rtmp_ffmpeg", "rtmpe", - "m3u8", "m3u8_native" or "http_dash_segments". + download, lower-case. One of "http", "https" or + one of the protocols defined in downloader.PROTOCOL_MAP * fragment_base_url Base URL for fragments. Each fragment's path value (if present) will be relative to @@ -179,6 +185,8 @@ class InfoExtractor(object): fragment_base_url * "duration" (optional, int or float) * "filesize" (optional, int) + * is_from_start Is a live format that can be downloaded + from the start.
Boolean * preference Order number of this format. If this field is present and not None, the formats get sorted by this field, regardless of all other values. @@ -209,7 +217,7 @@ class InfoExtractor(object): (HTTP or RTMP) download. Boolean. * has_drm The format has DRM and cannot be downloaded. Boolean * downloader_options A dictionary of downloader options as - described in FileDownloader + described in FileDownloader (For internal use only) RTMP formats can also have the additional fields: page_url, app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn, rtmp_protocol, rtmp_real_time @@ -221,6 +229,7 @@ class InfoExtractor(object): The following fields are optional: + direct: True if a direct video file was given (must only be set by GenericIE) alt_title: A secondary title of the video. display_id An alternative identifier for the video, not necessarily unique, but available before title. Typically, id is @@ -235,16 +244,22 @@ class InfoExtractor(object): * "resolution" (optional, string "{width}x{height}", deprecated) * "filesize" (optional, int) + * "http_headers" (dict) - HTTP headers for the request thumbnail: Full URL to a video thumbnail image. description: Full video description. uploader: Full name of the video uploader. license: License name the video is licensed under. creator: The creator of the video. - release_timestamp: UNIX timestamp of the moment the video was released. - release_date: The date (YYYYMMDD) when the video was released. timestamp: UNIX timestamp of the moment the video was uploaded - upload_date: Video upload date (YYYYMMDD). - If not explicitly set, calculated from timestamp. + upload_date: Video upload date in UTC (YYYYMMDD). + If not explicitly set, calculated from timestamp + release_timestamp: UNIX timestamp of the moment the video was released. + If it is not clear whether to use timestamp or this, use the former + release_date: The date (YYYYMMDD) when the video was released in UTC. + If not explicitly set, calculated from release_timestamp + modified_timestamp: UNIX timestamp of the moment the video was last modified. + modified_date: The date (YYYYMMDD) when the video was last modified in UTC. + If not explicitly set, calculated from modified_timestamp uploader_id: Nickname or id of the video uploader. uploader_url: Full URL to a personal webpage of the video uploader. channel: Full name of the channel the video is uploaded on. @@ -252,6 +267,7 @@ class InfoExtractor(object): fields. This depends on a particular extractor. channel_id: Id of the channel. channel_url: Full URL to a channel webpage. + channel_follower_count: Number of followers of the channel. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format {tag: subformats}. "tag" is usually a language code, and @@ -262,6 +278,8 @@ class InfoExtractor(object): * "url": A URL pointing to the subtitles file It can optionally also have: * "name": Name or description of the subtitles + * "http_headers": A dictionary of additional HTTP headers + to add to the request. "ext" will be calculated from URL if missing automatic_captions: Like 'subtitles'; contains automatically generated captions instead of normal subtitles @@ -340,6 +358,7 @@ class InfoExtractor(object): series, programme or podcast: series: Title of the series or programme the video episode belongs to. + series_id: Id of the series or programme the video episode belongs to, as a unicode string. season: Title of the season the video episode belongs to. 
season_number: Number of the season the video episode belongs to, as an integer. season_id: Id of the season the video episode belongs to, as a unicode string. @@ -366,6 +385,7 @@ class InfoExtractor(object): disc_number: Number of the disc or other physical medium the track belongs to, as an integer. release_year: Year (YYYY) when the album was released. + composer: Composer of the piece Unless mentioned otherwise, the fields should be Unicode strings. @@ -379,6 +399,11 @@ class InfoExtractor(object): Additionally, playlists can have "id", "title", and any other relevent attributes with the same semantics as videos (see above). + It can also have the following optional fields: + + playlist_count: The total number of videos in a playlist. If not given, + YoutubeDL tries to calculate it from "entries" + _type "multi_video" indicates that there are multiple videos that form a single show, for examples multiple acts of an opera or TV episode. @@ -404,13 +429,21 @@ class InfoExtractor(object): title, description etc. - Subclasses of this one should re-define the _real_initialize() and - _real_extract() methods and define a _VALID_URL regexp. + Subclasses of this should define a _VALID_URL regexp and, re-define the + _real_extract() and (optionally) _real_initialize() methods. Probably, they should also be added to the list of extractors. Subclasses may also override suitable() if necessary, but ensure the function signature is preserved and that this function imports everything it needs - (except other extractors), so that lazy_extractors works correctly + (except other extractors), so that lazy_extractors works correctly. + + To support username + password (or netrc) login, the extractor must define a + _NETRC_MACHINE and re-define _perform_login(username, password) and + (optionally) _initialize_pre_login() methods. The _perform_login method will + be called between _initialize_pre_login and _real_initialize if credentials + are passed by the user. In cases where it is necessary to have the login + process as part of the extraction rather than initialization, _perform_login + can be left undefined. _GEO_BYPASS attribute may be set to False in order to disable geo restriction bypass mechanisms for a particular extractor. @@ -438,17 +471,21 @@ class InfoExtractor(object): _GEO_COUNTRIES = None _GEO_IP_BLOCKS = None _WORKING = True + _NETRC_MACHINE = None + IE_DESC = None _LOGIN_HINTS = { - 'any': 'Use --cookies, --username and --password or --netrc to provide account credentials', + 'any': 'Use --cookies, --cookies-from-browser, --username and --password, or --netrc to provide account credentials', 'cookies': ( 'Use --cookies-from-browser or --cookies for the authentication. ' 'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'), - 'password': 'Use --username and --password or --netrc to provide account credentials', + 'password': 'Use --username and --password, or --netrc to provide account credentials', } def __init__(self, downloader=None): - """Constructor. Receives an optional downloader.""" + """Constructor. Receives an optional downloader (a YoutubeDL instance). 
+ If a downloader is not passed during initialization, + it must be set using "set_downloader()" before "extract()" is called""" self._ready = False self._x_forwarded_for_ip = None self._printed_messages = set() @@ -460,6 +497,8 @@ class InfoExtractor(object): # we have cached the regexp for *this* class, whereas getattr would also # match the superclass if '_VALID_URL_RE' not in cls.__dict__: + if '_VALID_URL' not in cls.__dict__: + cls._VALID_URL = cls._make_valid_url() cls._VALID_URL_RE = re.compile(cls._VALID_URL) return cls._VALID_URL_RE.match(url) @@ -486,6 +525,10 @@ class InfoExtractor(object): """Getter method for _WORKING.""" return cls._WORKING + @classmethod + def supports_login(cls): + return bool(cls._NETRC_MACHINE) + def initialize(self): """Initializes an instance (authentication, etc).""" self._printed_messages = set() @@ -494,6 +537,13 @@ class InfoExtractor(object): 'ip_blocks': self._GEO_IP_BLOCKS, }) if not self._ready: + self._initialize_pre_login() + if self.supports_login(): + username, password = self._get_login_info() + if username: + self._perform_login(username, password) + elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE): + self.report_warning(f'Login with password is not supported for this website. {self._LOGIN_HINTS["cookies"]}') self._real_initialize() self._ready = True @@ -602,10 +652,19 @@ class InfoExtractor(object): if self.__maybe_fake_ip_and_retry(e.countries): continue raise + except UnsupportedError: + raise except ExtractorError as e: - video_id = e.video_id or self.get_temp_id(url) - raise ExtractorError( - e.msg, video_id=video_id, ie=self.IE_NAME, tb=e.traceback, expected=e.expected, cause=e.cause) + kwargs = { + 'video_id': e.video_id or self.get_temp_id(url), + 'ie': self.IE_NAME, + 'tb': e.traceback or sys.exc_info()[2], + 'expected': e.expected, + 'cause': e.cause + } + if hasattr(e, 'countries'): + kwargs['countries'] = e.countries + raise type(e)(e.orig_msg, **kwargs) except compat_http_client.IncompleteRead as e: raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url)) except (KeyError, StopIteration) as e: @@ -627,16 +686,24 @@ class InfoExtractor(object): return False def set_downloader(self, downloader): - """Sets the downloader for this IE.""" + """Sets a YoutubeDL instance as the downloader for this IE.""" self._downloader = downloader + def _initialize_pre_login(self): + """ Intialization before login. Redefine in subclasses.""" + pass + + def _perform_login(self, username, password): + """ Login with username and password. Redefine in subclasses.""" + pass + def _real_initialize(self): """Real initialization process. Redefine in subclasses.""" pass def _real_extract(self, url): """Real extraction process. Redefine in subclasses.""" - pass + raise NotImplementedError('This method must be implemented by subclasses') @classmethod def ie_key(cls): @@ -664,7 +731,7 @@ class InfoExtractor(object): See _download_webpage docstring for arguments specification. """ if not self._downloader._first_webpage_request: - sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0 + sleep_interval = self.get_param('sleep_interval_requests') or 0 if sleep_interval > 0: self.to_screen('Sleeping %s seconds ...' 
% sleep_interval) time.sleep(sleep_interval) @@ -715,7 +782,7 @@ class InfoExtractor(object): errmsg = '%s: %s' % (errnote, error_to_compat_str(err)) if fatal: - raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) + raise ExtractorError(errmsg, cause=err) else: self.report_warning(errmsg) return False @@ -970,7 +1037,7 @@ class InfoExtractor(object): if transform_source: json_string = transform_source(json_string) try: - return json.loads(json_string) + return json.loads(json_string, strict=False) except ValueError as ve: errmsg = '%s: Failed to parse JSON ' % video_id if fatal: @@ -1063,23 +1130,30 @@ class InfoExtractor(object): def raise_login_required( self, msg='This video is only available for registered users', - metadata_available=False, method='any'): - if metadata_available and self.get_param('ignore_no_formats_error'): + metadata_available=False, method=NO_DEFAULT): + if metadata_available and ( + self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')): self.report_warning(msg) + return + if method is NO_DEFAULT: + method = 'any' if self.supports_login() else 'cookies' if method is not None: + assert method in self._LOGIN_HINTS, 'Invalid login method' msg = '%s. %s' % (msg, self._LOGIN_HINTS[method]) raise ExtractorError(msg, expected=True) def raise_geo_restricted( self, msg='This video is not available from your location due to geo restriction', countries=None, metadata_available=False): - if metadata_available and self.get_param('ignore_no_formats_error'): + if metadata_available and ( + self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')): self.report_warning(msg) else: raise GeoRestrictedError(msg, countries=countries) def raise_no_formats(self, msg, expected=False, video_id=None): - if expected and self.get_param('ignore_no_formats_error'): + if expected and ( + self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')): self.report_warning(msg, video_id) elif isinstance(msg, ExtractorError): raise msg @@ -1088,39 +1162,39 @@ class InfoExtractor(object): # Methods for following #608 @staticmethod - def url_result(url, ie=None, video_id=None, video_title=None, **kwargs): + def url_result(url, ie=None, video_id=None, video_title=None, *, url_transparent=False, **kwargs): """Returns a URL that points to a page that should be processed""" - # TODO: ie should be the class used for getting the info - video_info = {'_type': 'url', - 'url': url, - 'ie_key': ie} - video_info.update(kwargs) + if ie is not None: + kwargs['ie_key'] = ie if isinstance(ie, str) else ie.ie_key() if video_id is not None: - video_info['id'] = video_id + kwargs['id'] = video_id if video_title is not None: - video_info['title'] = video_title - return video_info + kwargs['title'] = video_title + return { + **kwargs, + '_type': 'url_transparent' if url_transparent else 'url', + 'url': url, + } - def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None): - urls = orderedSet( - self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) - for m in matches) - return self.playlist_result( - urls, playlist_id=playlist_id, playlist_title=playlist_title) + def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs): + urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {})) + for m in orderedSet(map(getter, matches) if getter else matches)) + return self.playlist_result(urls, playlist_id, 
playlist_title, **kwargs) @staticmethod - def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs): + def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs): """Returns a playlist""" - video_info = {'_type': 'playlist', - 'entries': entries} - video_info.update(kwargs) if playlist_id: - video_info['id'] = playlist_id + kwargs['id'] = playlist_id if playlist_title: - video_info['title'] = playlist_title + kwargs['title'] = playlist_title if playlist_description is not None: - video_info['description'] = playlist_description - return video_info + kwargs['description'] = playlist_description + return { + **kwargs, + '_type': 'multi_video' if multi_video else 'playlist', + 'entries': entries, + } def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """ @@ -1137,7 +1211,7 @@ class InfoExtractor(object): if mobj: break - _name = self._downloader._color_text(name, 'blue') + _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS) if mobj: if group is None: @@ -1225,8 +1299,8 @@ class InfoExtractor(object): @staticmethod def _og_regexes(prop): content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' - property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)' - % {'prop': re.escape(prop)}) + property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)' + % {'prop': re.escape(prop), 'sep': '(?::|[:-])'}) template = r'<meta[^>]+?%s[^>]+?%s' return [ template % (property_re, content_re), @@ -1257,8 +1331,8 @@ def _og_search_description(self, html, **kargs): return self._og_search_property('description', html, fatal=False, **kargs) - def _og_search_title(self, html, **kargs): - return self._og_search_property('title', html, **kargs) + def _og_search_title(self, html, *, fatal=False, **kargs): + return self._og_search_property('title', html, fatal=fatal, **kargs) def _og_search_video_url(self, html, name='video url', secure=True, **kargs): regexes = self._og_regexes('video') + self._og_regexes('video:url') @@ -1269,6 +1343,9 @@ def _og_search_url(self, html, **kargs): return self._og_search_property('url', html, **kargs) + def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs): + return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs) + def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): name = variadic(name) if display_name is None: @@ -1409,6 +1486,23 @@ continue info[count_key] = interaction_count + def extract_chapter_information(e): + chapters = [{ + 'title': part.get('name'), + 'start_time': part.get('startOffset'), + 'end_time': part.get('endOffset'), + } for part in variadic(e.get('hasPart') or []) if part.get('@type') == 'Clip'] + for idx, (last_c, current_c, next_c) in enumerate(zip( + [{'end_time': 0}] + chapters, chapters, chapters[1:])): + current_c['end_time'] = current_c['end_time'] or next_c['start_time'] + current_c['start_time'] = current_c['start_time'] or last_c['end_time'] + if None in current_c.values(): + self.report_warning(f'Chapter {idx} contains broken data.
Not extracting chapters') + return + if chapters: + chapters[-1]['end_time'] = chapters[-1]['end_time'] or info['duration'] + info['chapters'] = chapters + def extract_video_object(e): assert e['@type'] == 'VideoObject' author = e.get('author') @@ -1416,7 +1510,8 @@ 'url': url_or_none(e.get('contentUrl')), 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), - 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')), + 'thumbnails': [{'url': url_or_none(url)} + for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))], 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), # author can be an instance of 'Organization' or 'Person' types. @@ -1431,12 +1526,21 @@ 'view_count': int_or_none(e.get('interactionCount')), }) extract_interaction_statistic(e) + extract_chapter_information(e) - for e in json_ld: - if '@context' in e: + def traverse_json_ld(json_ld, at_top_level=True): + for e in json_ld: + if at_top_level and '@context' not in e: + continue + if at_top_level and set(e.keys()) == {'@context', '@graph'}: + traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False) + break item_type = e.get('@type') if expected_type is not None and expected_type != item_type: continue + rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none) + if rating is not None: + info['average_rating'] = rating if item_type in ('TVEpisode', 'Episode'): episode_name = unescapeHTML(e.get('name')) info.update({ @@ -1466,8 +1570,10 @@ info.update({ 'timestamp': parse_iso8601(e.get('datePublished')), 'title': unescapeHTML(e.get('headline')), - 'description': unescapeHTML(e.get('articleBody')), + 'description': unescapeHTML(e.get('articleBody') or e.get('description')), }) + if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject': + extract_video_object(e['video'][0]) elif item_type == 'VideoObject': extract_video_object(e) if expected_type is None: @@ -1481,7 +1587,34 @@ continue else: break - return dict((k, v) for k, v in info.items() if v is not None) + traverse_json_ld(json_ld) + + return filter_dict(info) + + def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw): + return self._parse_json( + self._search_regex( + r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>', + webpage, 'next.js data', fatal=fatal, **kw), + video_id, transform_source=transform_source, fatal=fatal) + + def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'): + ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function.
''' + # not all websites do this, but it can be changed + # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source + rectx = re.escape(context_name) + js, arg_keys, arg_vals = self._search_regex( + (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)\);?</script>' % rectx, + r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx), + webpage, context_name, group=['js', 'arg_keys', 'arg_vals']) + + args = dict(zip(arg_keys.split(','), arg_vals.split(','))) + + for key, val in args.items(): + if val in ('undefined', 'void 0'): + args[key] = 'null' + + return self._parse_json(js_to_json(js, args), video_id)['data'][0] @staticmethod def _hidden_inputs(html): @@ -1510,20 +1643,20 @@ default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality', 'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr', - 'proto', 'ext', 'hasaud', 'source', 'format_id') # These must not be aliases + 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr', 'height', 'width', 'proto', 'vext', 'abr', 'aext', - 'fps', 'fs_approx', 'source', 'format_id') + 'fps', 'fs_approx', 'source', 'id') settings = { 'vcodec': {'type': 'ordered', 'regex': True, 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']}, 'acodec': {'type': 'ordered', 'regex': True, - 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']}, + 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']}, 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range', 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]}, 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol', - 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.+', '.*dash', 'ws|websocket', '', 'mms|rtsp', 'none', 'f4']}, + 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']}, 'vext': {'type': 'ordered', 'field': 'video_ext', 'order': ('mp4', 'webm', 'flv', '', 'none'), 'order_free': ('webm', 'mp4', 'flv', '', 'none')}, @@ -1537,8 +1670,8 @@ 'ie_pref': {'priority': True, 'type': 'extractor'}, 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)}, 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)}, - 'lang': {'convert': 'ignore', 'field': 'language_preference'}, - 'quality': {'convert': 'float_none', 'default': -1}, + 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1}, + 'quality': {'convert': 'float', 'default': -1}, 'filesize': {'convert': 'bytes'}, 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'}, 'id': {'convert': 'string', 'field': 'format_id'}, @@ -1549,7 +1682,7 @@ 'vbr': {'convert': 'float_none'}, 'abr': {'convert': 'float_none'}, 'asr': {'convert': 'float_none'}, - 'source': {'convert': 'ignore', 'field': 'source_preference'}, + 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1}, 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')}, 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True}, @@ -1558,39 +1691,51 @@ 'res': {'type': 'multiple', 'field': ('height', 'width'), 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))},
- # Most of these exist only for compatibility reasons - 'dimension': {'type': 'alias', 'field': 'res'}, - 'resolution': {'type': 'alias', 'field': 'res'}, - 'extension': {'type': 'alias', 'field': 'ext'}, - 'bitrate': {'type': 'alias', 'field': 'br'}, - 'total_bitrate': {'type': 'alias', 'field': 'tbr'}, - 'video_bitrate': {'type': 'alias', 'field': 'vbr'}, - 'audio_bitrate': {'type': 'alias', 'field': 'abr'}, - 'framerate': {'type': 'alias', 'field': 'fps'}, - 'language_preference': {'type': 'alias', 'field': 'lang'}, # not named as 'language' because such a field exists - 'protocol': {'type': 'alias', 'field': 'proto'}, + # For compatibility with youtube-dl + 'format_id': {'type': 'alias', 'field': 'id'}, + 'preference': {'type': 'alias', 'field': 'ie_pref'}, + 'language_preference': {'type': 'alias', 'field': 'lang'}, 'source_preference': {'type': 'alias', 'field': 'source'}, + 'protocol': {'type': 'alias', 'field': 'proto'}, 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'}, - 'filesize_estimate': {'type': 'alias', 'field': 'size'}, - 'samplerate': {'type': 'alias', 'field': 'asr'}, - 'video_ext': {'type': 'alias', 'field': 'vext'}, - 'audio_ext': {'type': 'alias', 'field': 'aext'}, - 'video_codec': {'type': 'alias', 'field': 'vcodec'}, - 'audio_codec': {'type': 'alias', 'field': 'acodec'}, - 'video': {'type': 'alias', 'field': 'hasvid'}, - 'has_video': {'type': 'alias', 'field': 'hasvid'}, - 'audio': {'type': 'alias', 'field': 'hasaud'}, - 'has_audio': {'type': 'alias', 'field': 'hasaud'}, - 'extractor': {'type': 'alias', 'field': 'ie_pref'}, - 'preference': {'type': 'alias', 'field': 'ie_pref'}, - 'extractor_preference': {'type': 'alias', 'field': 'ie_pref'}, - 'format_id': {'type': 'alias', 'field': 'id'}, + + # Deprecated + 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True}, + 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True}, + 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True}, + 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True}, + 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True}, + 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True}, + 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True}, + 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True}, + 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True}, + 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True}, + 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True}, + 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True}, + 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True}, + 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True}, + 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True}, + 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True}, + 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True}, + 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True}, + 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True}, + 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True}, } - _order = [] + def __init__(self, ie, field_preference): + self._order = [] + self.ydl = ie._downloader + self.evaluate_params(self.ydl.params, field_preference) + if ie.get_param('verbose'): + self.print_verbose_info(self.ydl.write_debug) def _get_field_setting(self, field, key): if field not in self.settings: + if key in 
('forced', 'priority'): + return False + self.ydl.deprecation_warning( + f'Using arbitrary fields ({field}) for format sorting is deprecated ' + 'and may be removed in a future version') self.settings[field] = {} propObj = self.settings[field] if key not in propObj: @@ -1673,7 +1818,11 @@ class InfoExtractor(object): if field is None: continue if self._get_field_setting(field, 'type') == 'alias': - field = self._get_field_setting(field, 'field') + alias, field = field, self._get_field_setting(field, 'field') + if self._get_field_setting(alias, 'deprecated'): + self.ydl.deprecation_warning( + f'Format sorting alias {alias} is deprecated ' + f'and may be removed in a future version. Please use {field} instead') reverse = match.group('reverse') is not None closest = match.group('separator') == '~' limit_text = match.group('limit') @@ -1777,10 +1926,7 @@ class InfoExtractor(object): def _sort_formats(self, formats, field_preference=[]): if not formats: return - format_sort = self.FormatSort() # params and to_screen are taken from the downloader - format_sort.evaluate_params(self._downloader.params, field_preference) - if self.get_param('verbose', False): - format_sort.print_verbose_info(self._downloader.write_debug) + format_sort = self.FormatSort(self, field_preference) formats.sort(key=lambda f: format_sort.calculate_preference(f)) def _check_formats(self, formats, video_id): @@ -1899,7 +2045,7 @@ class InfoExtractor(object): tbr = int_or_none(media_el.attrib.get('bitrate')) width = int_or_none(media_el.attrib.get('width')) height = int_or_none(media_el.attrib.get('height')) - format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])) + format_id = join_nonempty(f4m_id, tbr or i) # If <bootstrapInfo> is not present, the specified f4m is a # stream-level manifest, and only set-level manifests may refer to # external resources.
See section 11.4 and section 4 of F4M spec @@ -1961,7 +2107,7 @@ class InfoExtractor(object): def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None): return { - 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), + 'format_id': join_nonempty(m3u8_id, 'meta'), 'url': m3u8_url, 'ext': ext, 'protocol': 'm3u8', @@ -2008,16 +2154,16 @@ class InfoExtractor(object): headers=headers, query=query, video_id=video_id) def _parse_m3u8_formats_and_subtitles( - self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8_native', + self, m3u8_doc, m3u8_url=None, ext=None, entry_protocol='m3u8_native', preference=None, quality=None, m3u8_id=None, live=False, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, video_id=None): formats, subtitles = [], {} - if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access - return formats, subtitles - - has_drm = re.search(r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', m3u8_doc) + has_drm = re.search('|'.join([ + r'#EXT-X-FAXS-CM:', # Adobe Flash Access + r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay + ]), m3u8_doc) def format_url(url): return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url) @@ -2056,9 +2202,9 @@ class InfoExtractor(object): if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is formats = [{ - 'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))), + 'format_id': join_nonempty(m3u8_id, idx), 'format_index': idx, - 'url': m3u8_url, + 'url': m3u8_url or encode_data_uri(m3u8_doc.encode('utf-8'), 'application/x-mpegurl'), 'ext': ext, 'protocol': entry_protocol, 'preference': preference, @@ -2105,7 +2251,7 @@ class InfoExtractor(object): if media_url: manifest_url = format_url(media_url) formats.extend({ - 'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))), + 'format_id': join_nonempty(m3u8_id, group_id, name, idx), 'format_note': name, 'format_index': idx, 'url': manifest_url, @@ -2162,9 +2308,9 @@ class InfoExtractor(object): # format_id intact. 
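# join_nonempty (from ..utils) replaces the older "'-'.join(filter(None, ...))"
# idiom for building format_id throughout these hunks. A simplified sketch of
# its behaviour (the real helper also takes delim= and from_dict= arguments):
def join_nonempty_sketch(*values, delim='-'):
    # Drop falsy parts, stringify the rest, join with the delimiter.
    return delim.join(str(v) for v in values if v)

assert join_nonempty_sketch('hls', None, 480) == 'hls-480'
assert join_nonempty_sketch(None, '') == ''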
if not live: stream_name = build_stream_name() - format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats)) + format_id[1] = stream_name or '%d' % (tbr or len(formats)) f = { - 'format_id': '-'.join(map(str, filter(None, format_id))), + 'format_id': join_nonempty(*format_id), 'format_index': idx, 'url': manifest_url, 'manifest_url': m3u8_url, @@ -2264,7 +2410,7 @@ class InfoExtractor(object): if smil is False: assert not fatal - return [] + return [], {} namespace = self._parse_smil_namespace(smil) @@ -2628,7 +2774,7 @@ class InfoExtractor(object): mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) formats, subtitles = [], {} - stream_numbers = {'audio': 0, 'video': 0} + stream_numbers = collections.defaultdict(int) for period in mpd_doc.findall(_add_ns('Period')): period_duration = parse_duration(period.get('duration')) or mpd_duration period_ms_info = extract_multisegment_info(period, { @@ -2644,11 +2790,15 @@ class InfoExtractor(object): mime_type = representation_attrib['mimeType'] content_type = representation_attrib.get('contentType', mime_type.split('/')[0]) - codecs = representation_attrib.get('codecs', '') + codecs = parse_codecs(representation_attrib.get('codecs', '')) if content_type not in ('video', 'audio', 'text'): if mime_type == 'image/jpeg': content_type = mime_type - elif codecs.split('.')[0] == 'stpp': + elif codecs['vcodec'] != 'none': + content_type = 'video' + elif codecs['acodec'] != 'none': + content_type = 'audio' + elif codecs.get('tcodec', 'none') != 'none': content_type = 'text' elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'): content_type = 'text' @@ -2694,10 +2844,8 @@ class InfoExtractor(object): 'format_note': 'DASH %s' % content_type, 'filesize': filesize, 'container': mimetype2ext(mime_type) + '_dash', - 'manifest_stream_number': stream_numbers[content_type] + **codecs } - f.update(parse_codecs(codecs)) - stream_numbers[content_type] += 1 elif content_type == 'text': f = { 'ext': mimetype2ext(mime_type), @@ -2770,7 +2918,8 @@ class InfoExtractor(object): segment_duration = None if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info: segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) - representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) + representation_ms_info['total_number'] = int(math.ceil( + float_or_none(period_duration, segment_duration, default=0))) representation_ms_info['fragments'] = [{ media_location_key: media_template % { 'Number': segment_number, @@ -2861,10 +3010,16 @@ class InfoExtractor(object): f['url'] = initialization_url f['fragments'].append({location_key(initialization_url): initialization_url}) f['fragments'].extend(representation_ms_info['fragments']) + if not period_duration: + period_duration = try_get( + representation_ms_info, + lambda r: sum(frag['duration'] for frag in r['fragments']), float) else: # Assuming direct URL to unfragmented media. 
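# On the total_number change above: float_or_none(x, scale) is a None-safe
# x / scale, so the new expression computes ceil(period_duration /
# segment_duration) without crashing when period_duration is missing.
# A rough sketch of the same arithmetic:
import math

def total_segments(period_duration, segment_duration):
    # Mirrors int(math.ceil(float_or_none(period_duration, segment_duration, default=0)))
    if not period_duration or not segment_duration:
        return 0
    return int(math.ceil(period_duration / segment_duration))

assert total_segments(60.0, 4.0) == 15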
f['url'] = base_url - if content_type in ('video', 'audio') or mime_type == 'image/jpeg': + if content_type in ('video', 'audio', 'image/jpeg'): + f['manifest_stream_number'] = stream_numbers[f['url']] + stream_numbers[f['url']] += 1 formats.append(f) elif content_type == 'text': subtitles.setdefault(lang or 'und', []).append(f) @@ -2953,13 +3108,6 @@ class InfoExtractor(object): }) fragment_ctx['time'] += fragment_ctx['duration'] - format_id = [] - if ism_id: - format_id.append(ism_id) - if stream_name: - format_id.append(stream_name) - format_id.append(compat_str(tbr)) - if stream_type == 'text': subtitles.setdefault(stream_language, []).append({ 'ext': 'ismt', @@ -2978,7 +3126,7 @@ class InfoExtractor(object): }) elif stream_type in ('video', 'audio'): formats.append({ - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty(ism_id, stream_name, tbr), 'url': ism_url, 'manifest_url': ism_url, 'ext': 'ismv' if stream_type == 'video' else 'isma', @@ -3008,7 +3156,7 @@ class InfoExtractor(object): }) return formats, subtitles - def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None): + def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None): def absolute_url(item_url): return urljoin(base_url, item_url) @@ -3402,15 +3550,11 @@ class InfoExtractor(object): return formats def _live_title(self, name): - """ Generate the title for a live video """ - now = datetime.datetime.now() - now_str = now.strftime('%Y-%m-%d %H:%M') - return name + ' ' + now_str + self._downloader.deprecation_warning('hypervideo_dl.InfoExtractor._live_title is deprecated and does not work as expected') + return name def _int(self, v, name, fatal=False, **kwargs): res = int_or_none(v, **kwargs) - if 'get_attr' in kwargs: - print(getattr(v, kwargs['get_attr'])) if res is None: msg = 'Failed to extract %s: Could not parse value %r' % (name, v) if fatal: @@ -3515,14 +3659,18 @@ class InfoExtractor(object): def extractor(): comments = [] + interrupted = True try: while True: comments.append(next(generator)) - except KeyboardInterrupt: - interrupted = True - self.to_screen('Interrupted by user') except StopIteration: interrupted = False + except KeyboardInterrupt: + self.to_screen('Interrupted by user') + except Exception as e: + if self.get_param('ignoreerrors') is not True: + raise + self._downloader.report_error(e) comment_count = len(comments) self.to_screen(f'Extracted {comment_count} comments') return { @@ -3536,11 +3684,11 @@ class InfoExtractor(object): @staticmethod def _merge_subtitle_items(subtitle_list1, subtitle_list2): - """ Merge subtitle items for one language. Items with duplicated URLs + """ Merge subtitle items for one language. Items with duplicated URLs/data will be dropped. 
""" - list1_urls = set([item['url'] for item in subtitle_list1]) + list1_data = set((item.get('url'), item.get('data')) for item in subtitle_list1) ret = list(subtitle_list1) - ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls]) + ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data) return ret @classmethod @@ -3565,9 +3713,8 @@ class InfoExtractor(object): def mark_watched(self, *args, **kwargs): if not self.get_param('mark_watched', False): return - if (self._get_login_info()[0] is not None - or self.get_param('cookiefile') - or self.get_param('cookiesfrombrowser')): + if (self.supports_login() and self._get_login_info()[0] is not None + or self.get_param('cookiefile') or self.get_param('cookiesfrombrowser')): self._mark_watched(*args, **kwargs) def _mark_watched(self, *args, **kwargs): @@ -3600,7 +3747,7 @@ class InfoExtractor(object): else 'public' if all_known else None) - def _configuration_arg(self, key, default=NO_DEFAULT, casesense=False): + def _configuration_arg(self, key, default=NO_DEFAULT, *, ie_key=None, casesense=False): ''' @returns A list of values for the extractor argument given by "key" or "default" if no such key is present @@ -3608,34 +3755,43 @@ class InfoExtractor(object): @param casesense When false, the values are converted to lower case ''' val = traverse_obj( - self._downloader.params, ('extractor_args', self.ie_key().lower(), key)) + self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key)) if val is None: return [] if default is NO_DEFAULT else default return list(val) if casesense else [x.lower() for x in val] + def _yes_playlist(self, playlist_id, video_id, smuggled_data=None, *, playlist_label='playlist', video_label='video'): + if not playlist_id or not video_id: + return not video_id + + no_playlist = (smuggled_data or {}).get('force_noplaylist') + if no_playlist is not None: + return not no_playlist + + video_id = '' if video_id is True else f' {video_id}' + playlist_id = '' if playlist_id is True else f' {playlist_id}' + if self.get_param('noplaylist'): + self.to_screen(f'Downloading just the {video_label}{video_id} because of --no-playlist') + return False + self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}') + return True + class SearchInfoExtractor(InfoExtractor): """ Base class for paged search queries extractors. They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query} - Instances should define _SEARCH_KEY and _MAX_RESULTS. 
+ Instances should define _SEARCH_KEY and optionally _MAX_RESULTS """ + _MAX_RESULTS = float('inf') + @classmethod def _make_valid_url(cls): return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY - @classmethod - def suitable(cls, url): - return re.match(cls._make_valid_url(), url) is not None - def _real_extract(self, query): - mobj = re.match(self._make_valid_url(), query) - if mobj is None: - raise ExtractorError('Invalid search query "%s"' % query) - - prefix = mobj.group('prefix') - query = mobj.group('query') + prefix, query = self._match_valid_url(query).group('prefix', 'query') if prefix == '': return self._get_n_results(query, 1) elif prefix == 'all': diff --git a/hypervideo_dl/extractor/corus.py b/hypervideo_dl/extractor/corus.py index 352951e..1194613 100644 --- a/hypervideo_dl/extractor/corus.py +++ b/hypervideo_dl/extractor/corus.py @@ -55,7 +55,6 @@ class CorusIE(ThePlatformFeedIE): 'timestamp': 1486392197, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, 'expected_warnings': ['Failed to parse JSON'], diff --git a/hypervideo_dl/extractor/coub.py b/hypervideo_dl/extractor/coub.py index eba6b73..e90aa19 100644 --- a/hypervideo_dl/extractor/coub.py +++ b/hypervideo_dl/extractor/coub.py @@ -57,7 +57,7 @@ class CoubIE(InfoExtractor): file_versions = coub['file_versions'] - QUALITIES = ('low', 'med', 'high') + QUALITIES = ('low', 'med', 'high', 'higher') MOBILE = 'mobile' IPHONE = 'iphone' @@ -86,6 +86,7 @@ class CoubIE(InfoExtractor): 'format_id': '%s-%s-%s' % (HTML5, kind, quality), 'filesize': int_or_none(item.get('size')), 'vcodec': 'none' if kind == 'audio' else None, + 'acodec': 'none' if kind == 'video' else None, 'quality': quality_key(quality), 'source_preference': preference_key(HTML5), }) diff --git a/hypervideo_dl/extractor/cozytv.py b/hypervideo_dl/extractor/cozytv.py new file mode 100644 index 0000000..d49f1ca --- /dev/null +++ b/hypervideo_dl/extractor/cozytv.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import unified_strdate + + +class CozyTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cozy\.tv/(?P<uploader>[^/]+)/replays/(?P<id>[^/$#&?]+)' + + _TESTS = [{ + 'url': 'https://cozy.tv/beardson/replays/2021-11-19_1', + 'info_dict': { + 'id': 'beardson-2021-11-19_1', + 'ext': 'mp4', + 'title': 'pokemon pt2', + 'uploader': 'beardson', + 'upload_date': '20211119', + 'was_live': True, + 'duration': 7981, + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + uploader, date = self._match_valid_url(url).groups() + id = f'{uploader}-{date}' + data_json = self._download_json(f'https://api.cozy.tv/cache/{uploader}/replay/{date}', id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + f'https://cozycdn.foxtrotstream.xyz/replays/{uploader}/{date}/index.m3u8', id, ext='mp4') + return { + 'id': id, + 'title': data_json.get('title'), + 'uploader': data_json.get('user') or uploader, + 'upload_date': unified_strdate(data_json.get('date')), + 'was_live': True, + 'duration': data_json.get('duration'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/hypervideo_dl/extractor/cpac.py b/hypervideo_dl/extractor/cpac.py new file mode 100644 index 0000000..2274115 --- /dev/null +++ b/hypervideo_dl/extractor/cpac.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + str_or_none, + try_get,
unified_timestamp, + update_url_query, + urljoin, +) + +# compat_range +try: + if callable(xrange): + range = xrange +except (NameError, TypeError): + pass + + +class CPACIE(InfoExtractor): + IE_NAME = 'cpac' + _VALID_URL = r'https?://(?:www\.)?cpac\.ca/(?Pl-)?episode\?id=(?P[\da-f]{8}(?:-[\da-f]{4}){3}-[\da-f]{12})' + _TEST = { + # 'url': 'http://www.cpac.ca/en/programs/primetime-politics/episodes/65490909', + 'url': 'https://www.cpac.ca/episode?id=fc7edcae-4660-47e1-ba61-5b7f29a9db0f', + 'md5': 'e46ad699caafd7aa6024279f2614e8fa', + 'info_dict': { + 'id': 'fc7edcae-4660-47e1-ba61-5b7f29a9db0f', + 'ext': 'mp4', + 'upload_date': '20220215', + 'title': 'News Conference to Celebrate National Kindness Week – February 15, 2022', + 'description': 'md5:466a206abd21f3a6f776cdef290c23fb', + 'timestamp': 1644901200, + }, + 'params': { + 'format': 'bestvideo', + 'hls_prefer_native': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + url_lang = 'fr' if '/l-episode?' in url else 'en' + + content = self._download_json( + 'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/episode/index.xml&crafterSite=cpacca&id=' + video_id, + video_id) + video_url = try_get(content, lambda x: x['page']['details']['videoUrl'], compat_str) + formats = [] + if video_url: + content = content['page'] + title = str_or_none(content['details']['title_%s_t' % (url_lang, )]) + formats = self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', ext='mp4') + for fmt in formats: + # prefer language to match URL + fmt_lang = fmt.get('language') + if fmt_lang == url_lang: + fmt['language_preference'] = 10 + elif not fmt_lang: + fmt['language_preference'] = -1 + else: + fmt['language_preference'] = -10 + + self._sort_formats(formats) + + category = str_or_none(content['details']['category_%s_t' % (url_lang, )]) + + def is_live(v_type): + return (v_type == 'live') if v_type is not None else None + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': str_or_none(content['details'].get('description_%s_t' % (url_lang, ))), + 'timestamp': unified_timestamp(content['details'].get('liveDateTime')), + 'category': [category] if category else None, + 'thumbnail': urljoin(url, str_or_none(content['details'].get('image_%s_s' % (url_lang, )))), + 'is_live': is_live(content['details'].get('type')), + } + + +class CPACPlaylistIE(InfoExtractor): + IE_NAME = 'cpac:playlist' + _VALID_URL = r'(?i)https?://(?:www\.)?cpac\.ca/(?:program|search|(?Pemission|rechercher))\?(?:[^&]+&)*?(?P(?:id=\d+|programId=\d+|key=[^&]+))' + + _TESTS = [{ + 'url': 'https://www.cpac.ca/program?id=6', + 'info_dict': { + 'id': 'id=6', + 'title': 'Headline Politics', + 'description': 'Watch CPAC’s signature long-form coverage of the day’s pressing political events as they unfold.', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.cpac.ca/search?key=hudson&type=all&order=desc', + 'info_dict': { + 'id': 'key=hudson', + 'title': 'hudson', + }, + 'playlist_count': 22, + }, { + 'url': 'https://www.cpac.ca/search?programId=50', + 'info_dict': { + 'id': 'programId=50', + 'title': '50', + }, + 'playlist_count': 9, + }, { + 'url': 'https://www.cpac.ca/emission?id=6', + 'only_matching': True, + }, { + 'url': 'https://www.cpac.ca/rechercher?key=hudson&type=all&order=desc', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + url_lang = 'fr' if any(x in url for x in ('/emission?', '/rechercher?')) else 'en' + pl_type, list_type = ('program', 
'itemList') if any(x in url for x in ('/program?', '/emission?')) else ('search', 'searchResult') + api_url = ( + 'https://www.cpac.ca/api/1/services/contentModel.json?url=/site/website/%s/index.xml&crafterSite=cpacca&%s' + % (pl_type, video_id, )) + content = self._download_json(api_url, video_id) + entries = [] + total_pages = int_or_none(try_get(content, lambda x: x['page'][list_type]['totalPages']), default=1) + for page in range(1, total_pages + 1): + if page > 1: + api_url = update_url_query(api_url, {'page': '%d' % (page, ), }) + content = self._download_json( + api_url, video_id, + note='Downloading continuation - %d' % (page, ), + fatal=False) + + for item in try_get(content, lambda x: x['page'][list_type]['item'], list) or []: + episode_url = urljoin(url, try_get(item, lambda x: x['url_%s_s' % (url_lang, )])) + if episode_url: + entries.append(episode_url) + + return self.playlist_result( + (self.url_result(entry) for entry in entries), + playlist_id=video_id, + playlist_title=try_get(content, lambda x: x['page']['program']['title_%s_t' % (url_lang, )]) or video_id.split('=')[-1], + playlist_description=try_get(content, lambda x: x['page']['program']['description_%s_t' % (url_lang, )]), + ) diff --git a/hypervideo_dl/extractor/crackle.py b/hypervideo_dl/extractor/crackle.py index 2c9d28d..db4962c 100644 --- a/hypervideo_dl/extractor/crackle.py +++ b/hypervideo_dl/extractor/crackle.py @@ -23,32 +23,35 @@ from ..utils import ( class CrackleIE(InfoExtractor): _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?(?:sony)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P\d+)' _TESTS = [{ - # geo restricted to CA - 'url': 'https://www.crackle.com/andromeda/2502343', + # Crackle is available in the United States and territories + 'url': 'https://www.crackle.com/thanksgiving/2510064', 'info_dict': { - 'id': '2502343', + 'id': '2510064', 'ext': 'mp4', - 'title': 'Under The Night', - 'description': 'md5:d2b8ca816579ae8a7bf28bfff8cefc8a', - 'duration': 2583, + 'title': 'Touch Football', + 'description': 'md5:cfbb513cf5de41e8b56d7ab756cff4df', + 'duration': 1398, 'view_count': int, 'average_rating': 0, - 'age_limit': 14, - 'genre': 'Action, Sci-Fi', - 'creator': 'Allan Kroeker', - 'artist': 'Keith Hamilton Cobb, Kevin Sorbo, Lisa Ryder, Lexa Doig, Robert Hewitt Wolfe', - 'release_year': 2000, - 'series': 'Andromeda', - 'episode': 'Under The Night', + 'age_limit': 17, + 'genre': 'Comedy', + 'creator': 'Daniel Powell', + 'artist': 'Chris Elliott, Amy Sedaris', + 'release_year': 2016, + 'series': 'Thanksgiving', + 'episode': 'Touch Football', 'season_number': 1, 'episode_number': 1, }, 'params': { # m3u8 download 'skip_download': True, - } + }, + 'expected_warnings': [ + 'Trying with a list of known countries' + ], }, { - 'url': 'https://www.sonycrackle.com/andromeda/2502343', + 'url': 'https://www.sonycrackle.com/thanksgiving/2510064', 'only_matching': True, }] @@ -129,7 +132,6 @@ class CrackleIE(InfoExtractor): break ignore_no_formats = self.get_param('ignore_no_formats_error') - allow_unplayable_formats = self.get_param('allow_unplayable_formats') if not media or (not media.get('MediaURLs') and not ignore_no_formats): raise ExtractorError( @@ -143,9 +145,9 @@ class CrackleIE(InfoExtractor): for e in media.get('MediaURLs') or []: if e.get('UseDRM'): has_drm = True - if not allow_unplayable_formats: - continue - format_url = url_or_none(e.get('Path')) + format_url = url_or_none(e.get('DRMPath')) + else: + format_url = url_or_none(e.get('Path')) if not format_url: continue ext = 
determine_ext(format_url) diff --git a/hypervideo_dl/extractor/craftsy.py b/hypervideo_dl/extractor/craftsy.py new file mode 100644 index 0000000..ed2f442 --- /dev/null +++ b/hypervideo_dl/extractor/craftsy.py @@ -0,0 +1,71 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor + +from ..utils import ( + dict_get, + get_element_by_id, + js_to_json, + traverse_obj, +) + + +class CraftsyIE(InfoExtractor): + _VALID_URL = r'https?://www.craftsy.com/class/(?P[a-z0-9_-]+)/' + _TESTS = [{ + 'url': 'https://www.craftsy.com/class/the-midnight-quilt-show-season-5/', + 'info_dict': { + 'id': 'the-midnight-quilt-show-season-5', + 'title': 'The Midnight Quilt Show Season 5', + 'description': 'md5:113eda818e985d1a566625fb2f833b7a', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.craftsy.com/class/sew-your-own-designer-handbag/', + 'info_dict': { + 'id': 'sew-your-own-designer-handbag', + 'title': 'Sew Your Own Designer Handbag', + 'description': 'md5:8270d0ef5427d3c895a27351aeaac276', + }, + 'playlist_mincount': 1, + }, { + 'url': 'https://www.craftsy.com/class/all-access-estes-park-wool-market/', + 'info_dict': { + 'id': 'all-access-estes-park-wool-market', + 'title': 'All Access: Estes Park Wool Market', + 'description': 'md5:aded1bd8d38ae2fae4dae936c0ae01e7', + }, + 'playlist_count': 6, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_data = self._parse_json(self._search_regex( + r'class_video_player_vars\s*=\s*({.*})\s*;', + get_element_by_id('vidstore-classes_class-video-player-js-extra', webpage), + 'video data'), video_id, transform_source=js_to_json) + + account_id = traverse_obj(video_data, ('video_player', 'bc_account_id')) + + entries = [] + class_preview = traverse_obj(video_data, ('video_player', 'class_preview')) + if class_preview: + v_id = class_preview.get('video_id') + entries.append(self.url_result( + f'http://players.brightcove.net/{account_id}/default_default/index.html?videoId={v_id}', + BrightcoveNewIE, v_id, class_preview.get('title'))) + + if dict_get(video_data, ('is_free', 'user_has_access')): + entries += [ + self.url_result( + f'http://players.brightcove.net/{account_id}/default_default/index.html?videoId={lesson["video_id"]}', + BrightcoveNewIE, lesson['video_id'], lesson.get('title')) + for lesson in video_data['lessons']] + + return self.playlist_result( + entries, video_id, video_data.get('class_title'), + self._html_search_meta(('og:description', 'description'), webpage, default=None)) diff --git a/hypervideo_dl/extractor/crowdbunker.py b/hypervideo_dl/extractor/crowdbunker.py new file mode 100644 index 0000000..72906af --- /dev/null +++ b/hypervideo_dl/extractor/crowdbunker.py @@ -0,0 +1,113 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + try_get, + unified_strdate, +) + + +class CrowdBunkerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?crowdbunker\.com/v/(?P[^/?#$&]+)' + + _TESTS = [{ + 'url': 'https://crowdbunker.com/v/0z4Kms8pi8I', + 'info_dict': { + 'id': '0z4Kms8pi8I', + 'ext': 'mp4', + 'title': '117) Pass vax et solutions', + 'description': 'md5:86bcb422c29475dbd2b5dcfa6ec3749c', + 'view_count': int, + 'duration': 5386, + 'uploader': 'Jérémie Mercier', + 'uploader_id': 'UCeN_qQV829NYf0pvPJhW5dQ', + 'like_count': int, + 'upload_date': '20211218', + 'thumbnail': 
'https://scw.divulg.org/cb-medias4/images/0z4Kms8pi8I/maxres.jpg' + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + data_json = self._download_json(f'https://api.divulg.org/post/{id}/details', + id, headers={'accept': 'application/json, text/plain, */*'}) + video_json = data_json['video'] + formats, subtitles = [], {} + for sub in video_json.get('captions') or []: + sub_url = try_get(sub, lambda x: x['file']['url']) + if not sub_url: + continue + subtitles.setdefault(sub.get('languageCode', 'fr'), []).append({ + 'url': sub_url, + }) + + mpd_url = try_get(video_json, lambda x: x['dashManifest']['url']) + if mpd_url: + fmts, subs = self._extract_mpd_formats_and_subtitles(mpd_url, id) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + m3u8_url = try_get(video_json, lambda x: x['hlsManifest']['url']) + if m3u8_url: + fmts, subs = self._extract_m3u8_formats_and_subtitles(mpd_url, id) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + + thumbnails = [{ + 'url': image['url'], + 'height': int_or_none(image.get('height')), + 'width': int_or_none(image.get('width')), + } for image in video_json.get('thumbnails') or [] if image.get('url')] + + self._sort_formats(formats) + return { + 'id': id, + 'title': video_json.get('title'), + 'description': video_json.get('description'), + 'view_count': video_json.get('viewCount'), + 'duration': video_json.get('duration'), + 'uploader': try_get(data_json, lambda x: x['channel']['name']), + 'uploader_id': try_get(data_json, lambda x: x['channel']['id']), + 'like_count': data_json.get('likesCount'), + 'upload_date': unified_strdate(video_json.get('publishedAt') or video_json.get('createdAt')), + 'thumbnails': thumbnails, + 'formats': formats, + 'subtitles': subtitles, + } + + +class CrowdBunkerChannelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?crowdbunker\.com/@(?P[^/?#$&]+)' + + _TESTS = [{ + 'url': 'https://crowdbunker.com/@Milan_UHRIN', + 'playlist_mincount': 14, + 'info_dict': { + 'id': 'Milan_UHRIN', + }, + }] + + def _entries(self, id): + last = None + + for page in itertools.count(): + channel_json = self._download_json( + f'https://api.divulg.org/organization/{id}/posts', id, headers={'accept': 'application/json, text/plain, */*'}, + query={'after': last} if last else {}, note=f'Downloading Page {page}') + for item in channel_json.get('items') or []: + v_id = item.get('uid') + if not v_id: + continue + yield self.url_result( + 'https://crowdbunker.com/v/%s' % v_id, ie=CrowdBunkerIE.ie_key(), video_id=v_id) + last = channel_json.get('last') + if not last: + break + + def _real_extract(self, url): + id = self._match_id(url) + return self.playlist_result(self._entries(id), playlist_id=id) diff --git a/hypervideo_dl/extractor/crunchyroll.py b/hypervideo_dl/extractor/crunchyroll.py index 511ac1b..7edb645 100644 --- a/hypervideo_dl/extractor/crunchyroll.py +++ b/hypervideo_dl/extractor/crunchyroll.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 import re import json import zlib @@ -8,7 +9,7 @@ import zlib from hashlib import sha1 from math import pow, sqrt, floor from .common import InfoExtractor -from .vrv import VRVIE +from .vrv import VRVBaseIE from ..compat import ( compat_b64decode, compat_etree_Element, @@ -23,14 +24,17 @@ from ..utils import ( bytes_to_intlist, extract_attributes, float_or_none, + format_field, intlist_to_bytes, int_or_none, + join_nonempty, lowercase_escape, merge_dicts, + 
qualities, remove_end, sanitized_Request, + traverse_obj, try_get, - urlencode_postdata, xpath_text, ) from ..aes import ( @@ -39,8 +43,8 @@ from ..aes import ( class CrunchyrollBaseIE(InfoExtractor): - _LOGIN_URL = 'https://www.crunchyroll.com/login' - _LOGIN_FORM = 'login_form' + _LOGIN_URL = 'https://www.crunchyroll.com/welcome/login' + _API_BASE = 'https://api.crunchyroll.com' _NETRC_MACHINE = 'crunchyroll' def _call_rpc_api(self, method, video_id, note=None, data=None): @@ -53,57 +57,50 @@ class CrunchyrollBaseIE(InfoExtractor): 'Content-Type': 'application/x-www-form-urlencoded', }) - def _login(self): - username, password = self._get_login_info() - if username is None: + def _perform_login(self, username, password): + if self._get_cookies(self._LOGIN_URL).get('etp_rt'): return - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - - def is_logged(webpage): - return 'href="/logout"' in webpage - - # Already logged in - if is_logged(login_page): - return - - login_form_str = self._search_regex( - r'(?P
<form><form[^>]+?id=(["\'])%s\2[^>]*>)' % self._LOGIN_FORM, - login_page, 'login form', group='form') - - post_url = extract_attributes(login_form_str).get('action') - if not post_url: - post_url = self._LOGIN_URL - elif not post_url.startswith('http'): - post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) - - login_form = self._form_hidden_inputs(self._LOGIN_FORM, login_page) - - login_form.update({ - 'login_form[name]': username, - 'login_form[password]': password, - }) - - response = self._download_webpage( - post_url, None, 'Logging in', 'Wrong login info', - data=urlencode_postdata(login_form), - headers={'Content-Type': 'application/x-www-form-urlencoded'}) - - # Successful login - if is_logged(response): - return - - error = self._html_search_regex( - '(?s)<ul[^>]+class=["\']messages["\'][^>]*>(.+?)</ul>', - response, 'error message', default=None) - if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) - - raise ExtractorError('Unable to log in') - - def _real_initialize(self): - self._login() + upsell_response = self._download_json( + f'{self._API_BASE}/get_upsell_data.0.json', None, 'Getting session id', + query={ + 'sess_id': 1, + 'device_id': 'whatvalueshouldbeforweb', + 'device_type': 'com.crunchyroll.static', + 'access_token': 'giKq5eY27ny3cqz', + 'referer': self._LOGIN_URL + }) + if upsell_response['code'] != 'ok': + raise ExtractorError('Could not get session id') + session_id = upsell_response['data']['session_id'] + + login_response = self._download_json( + f'{self._API_BASE}/login.1.json', None, 'Logging in', + data=compat_urllib_parse_urlencode({ + 'account': username, + 'password': password, + 'session_id': session_id + }).encode('ascii')) + if login_response['code'] != 'ok': + raise ExtractorError('Login failed. Server message: %s' % login_response['message'], expected=True) + if not self._get_cookies(self._LOGIN_URL).get('etp_rt'): + raise ExtractorError('Login succeeded but did not set etp_rt cookie') + + # Beta-specific, but needed for redirects + def _get_beta_embedded_json(self, webpage, display_id): + initial_state = self._parse_json(self._search_regex( + r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), display_id) + app_config = self._parse_json(self._search_regex( + r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), display_id) + return initial_state, app_config + + def _redirect_to_beta(self, webpage, iekey, video_id): + if not self._get_cookies(self._LOGIN_URL).get('etp_rt'): + raise ExtractorError('Received a beta page from non-beta url when not logged in.') + initial_state, app_config = self._get_beta_embedded_json(webpage, video_id) + url = app_config['baseSiteUrl'] + initial_state['router']['locations']['current']['pathname'] + self.to_screen(f'{video_id}: Redirected to beta site - {url}') + return self.url_result(f'{url}', iekey, video_id) @staticmethod def _add_skip_wall(url): @@ -119,7 +116,7 @@ class CrunchyrollBaseIE(InfoExtractor): parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) -class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): +class CrunchyrollIE(CrunchyrollBaseIE, VRVBaseIE): IE_NAME = 'crunchyroll' _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P<id>[0-9]+))(?:[/?&]|$)' _TESTS = [{ @@ -425,6 +422,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text webpage = self._download_webpage( self._add_skip_wall(webpage_url), video_id, headers=self.geo_verification_headers()) + if re.search(r'<div id="preload-data">
    ', webpage): + return self._redirect_to_beta(webpage, CrunchyrollBetaIE.ie_key(), video_id) note_m = self._html_search_regex( r'
<div class="showmedia-trailer-notice">(.+?)</div>
', webpage, 'trailer-notice', default='') @@ -478,19 +477,24 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)
</a>', r'<div>
\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>
    '], webpage, 'video_uploader', default=False) + requested_languages = self._configuration_arg('language') + requested_hardsubs = [('' if val == 'none' else val) for val in self._configuration_arg('hardsub')] + language_preference = qualities((requested_languages or [language or ''])[::-1]) + hardsub_preference = qualities((requested_hardsubs or ['', language or ''])[::-1]) + formats = [] for stream in media.get('streams', []): - audio_lang = stream.get('audio_lang') - hardsub_lang = stream.get('hardsub_lang') + audio_lang = stream.get('audio_lang') or '' + hardsub_lang = stream.get('hardsub_lang') or '' + if (requested_languages and audio_lang.lower() not in requested_languages + or requested_hardsubs and hardsub_lang.lower() not in requested_hardsubs): + continue vrv_formats = self._extract_vrv_formats( stream.get('url'), video_id, stream.get('format'), audio_lang, hardsub_lang) for f in vrv_formats: - f['language_preference'] = 1 if audio_lang == language else 0 - f['quality'] = ( - 1 if not hardsub_lang - else 0 if hardsub_lang == language - else -1) + f['language_preference'] = language_preference(audio_lang) + f['quality'] = hardsub_preference(hardsub_lang) formats.extend(vrv_formats) if not formats: available_fmts = [] @@ -684,6 +688,8 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): # https:// gives a 403, but http:// does not self._add_skip_wall(url).replace('https://', 'http://'), show_id, headers=self.geo_verification_headers()) + if re.search(r'
<div id="preload-data">', webpage): + return self._redirect_to_beta(webpage, CrunchyrollBetaShowIE.ie_key(), show_id) title = self._html_search_meta('name', webpage, default=None) episode_re = r'<li id="showview_videos_media_(\d+)"[^>
]+>.*?<a href="([^"]+)"'(?:\w{1,2}/)?)watch/(?P<internal_id>\w+)/(?P<id>[\w\-]+)/?(?:\?|$)' + _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)watch/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)' _TESTS = [{ 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', 'info_dict': { @@ -719,26 +772,129 @@ class CrunchyrollBetaIE(CrunchyrollBaseIE): 'uploader': 'Toei Animation', 'title': 'World Trigger Episode 73 – To the Future', 'upload_date': '20160402', + 'episode_number': 73, + 'series': 'World Trigger', + 'average_rating': 4.9, + 'episode': 'To the Future', + 'season': 'World Trigger', + 'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/c870dedca1a83137c2d3d144984155ed1459527119_main.jpg', + 'season_number': 1, + }, + 'params': {'skip_download': 'm3u8'}, + 'expected_warnings': ['Unable to download XML'] + }, { + 'url': 'https://beta.crunchyroll.com/watch/GYK53DMPR/wicked-lord-shingan-reborn', + 'info_dict': { + 'id': '648781', + 'ext': 'mp4', + 'episode_number': 1, + 'timestamp': 1389173400, + 'series': 'Love, Chunibyo & Other Delusions - Heart Throb -', + 'description': 'md5:5579d1a0355cc618558ba23d27067a62', + 'uploader': 'TBS', + 'episode': 'Wicked Lord Shingan... Reborn', + 'average_rating': 4.9, + 'season': 'Love, Chunibyo & Other Delusions - Heart Throb -', + 'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/2ba0384e225a5370d5f0ee9496d91ea51389046521_main.jpg', + 'title': 'Love, Chunibyo & Other Delusions - Heart Throb - Episode 1 – Wicked Lord Shingan... Reborn', + 'season_number': 2, + 'upload_date': '20140108', }, 'params': {'skip_download': 'm3u8'}, 'expected_warnings': ['Unable to download XML'] + }, { + 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/', + 'only_matching': True, }] def _real_extract(self, url): - lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'internal_id', 'id') - webpage = self._download_webpage(url, display_id) - episode_data = self._parse_json( - self._search_regex(r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'episode data'), - display_id)['content']['byId'][internal_id] - video_id = episode_data['external_id'].split('.')[1] - series_id = episode_data['episode_metadata']['series_slug_title'] - return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id}/{display_id}-{video_id}', - CrunchyrollIE.ie_key(), video_id) - - -class CrunchyrollBetaShowIE(CrunchyrollBaseIE): + lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') + + if not self._get_cookies(url).get('etp_rt'): + return self._redirect_from_beta(url, lang, internal_id, display_id, True, CrunchyrollIE.ie_key()) + + api_domain, bucket, params = self._get_params(lang) + + episode_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id, + note='Retrieving episode metadata', + query=params) + if episode_response.get('is_premium_only') and not episode_response.get('playback'): + raise ExtractorError('This video is for premium members only.', expected=True) + stream_response = self._download_json( + episode_response['playback'], display_id, + note='Retrieving stream info') + + thumbnails = [] + for thumbnails_data in traverse_obj(episode_response, ('images', 'thumbnail')): + for thumbnail_data in thumbnails_data: + thumbnails.append({ + 'url': thumbnail_data.get('source'), + 'width': thumbnail_data.get('width'), + 'height': thumbnail_data.get('height'), + }) + subtitles = {} + for lang, subtitle_data in stream_response.get('subtitles').items(): + subtitles[lang] = [{ + 'url':
subtitle_data.get('url'), + 'ext': subtitle_data.get('format') + }] + + requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])] + hardsub_preference = qualities(requested_hardsubs[::-1]) + requested_formats = self._configuration_arg('format') or ['adaptive_hls'] + + formats = [] + for stream_type, streams in stream_response.get('streams', {}).items(): + if stream_type not in requested_formats: + continue + for stream in streams.values(): + hardsub_lang = stream.get('hardsub_locale') or '' + if hardsub_lang.lower() not in requested_hardsubs: + continue + format_id = join_nonempty( + stream_type, + format_field(stream, 'hardsub_locale', 'hardsub-%s')) + if not stream.get('url'): + continue + if stream_type.split('_')[-1] == 'hls': + adaptive_formats = self._extract_m3u8_formats( + stream['url'], display_id, 'mp4', m3u8_id=format_id, + note='Downloading %s information' % format_id, + fatal=False) + elif stream_type.split('_')[-1] == 'dash': + adaptive_formats = self._extract_mpd_formats( + stream['url'], display_id, mpd_id=format_id, + note='Downloading %s information' % format_id, + fatal=False) + for f in adaptive_formats: + if f.get('acodec') != 'none': + f['language'] = stream_response.get('audio_locale') + f['quality'] = hardsub_preference(hardsub_lang.lower()) + formats.extend(adaptive_formats) + self._sort_formats(formats) + + return { + 'id': internal_id, + 'title': '%s Episode %s – %s' % (episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')), + 'description': episode_response.get('description').replace(r'\r\n', '\n'), + 'duration': float_or_none(episode_response.get('duration_ms'), 1000), + 'thumbnails': thumbnails, + 'series': episode_response.get('series_title'), + 'series_id': episode_response.get('series_id'), + 'season': episode_response.get('season_title'), + 'season_id': episode_response.get('season_id'), + 'season_number': episode_response.get('season_number'), + 'episode': episode_response.get('title'), + 'episode_number': episode_response.get('sequence_number'), + 'subtitles': subtitles, + 'formats': formats + } + + +class CrunchyrollBetaShowIE(CrunchyrollBetaBaseIE): IE_NAME = 'crunchyroll:playlist:beta' - _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/\w+/(?P<id>[\w\-]+)/?(?:\?|$)' + _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)' _TESTS = [{ 'url': 'https://beta.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA', 'info_dict': { @@ -746,12 +902,57 @@ class CrunchyrollBetaShowIE(CrunchyrollBaseIE): 'title': 'Girl Friend BETA', }, 'playlist_mincount': 10, + }, { + 'url': 'https://beta.crunchyroll.com/series/GYJQV73V6/love-chunibyo--other-delusions---heart-throb--', + 'info_dict': { + 'id': 'love-chunibyo-other-delusions-heart-throb-', + 'title': 'Love, Chunibyo & Other Delusions - Heart Throb -', + }, + 'playlist_mincount': 10, }, { 'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR/Girl-Friend-BETA', 'only_matching': True, }] def _real_extract(self, url): - lang, series_id = self._match_valid_url(url).group('lang', 'id') - return self.url_result(f'https://www.crunchyroll.com/{lang}{series_id.lower()}', - CrunchyrollShowPlaylistIE.ie_key(), series_id) + lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') + + if not self._get_cookies(url).get('etp_rt'): + return self._redirect_from_beta(url, lang, internal_id, display_id, False,
CrunchyrollShowPlaylistIE.ie_key()) + + api_domain, bucket, params = self._get_params(lang) + + series_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/series/{internal_id}', display_id, + note='Retrieving series metadata', query=params) + + seasons_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/seasons?series_id={internal_id}', display_id, + note='Retrieving season list', query=params) + + def entries(): + for season in seasons_response['items']: + episodes_response = self._download_json( + f'{api_domain}/cms/v2{bucket}/episodes?season_id={season["id"]}', display_id, + note=f'Retrieving episode list for {season.get("slug_title")}', query=params) + for episode in episodes_response['items']: + episode_id = episode['id'] + episode_display_id = episode['slug_title'] + yield { + '_type': 'url', + 'url': f'https://beta.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}', + 'ie_key': CrunchyrollBetaIE.ie_key(), + 'id': episode_id, + 'title': '%s Episode %s – %s' % (episode.get('season_title'), episode.get('episode'), episode.get('title')), + 'description': try_get(episode, lambda x: x['description'].replace(r'\r\n', '\n')), + 'duration': float_or_none(episode.get('duration_ms'), 1000), + 'series': episode.get('series_title'), + 'series_id': episode.get('series_id'), + 'season': episode.get('season_title'), + 'season_id': episode.get('season_id'), + 'season_number': episode.get('season_number'), + 'episode': episode.get('title'), + 'episode_number': episode.get('sequence_number') + } + + return self.playlist_result(entries(), internal_id, series_response.get('title')) diff --git a/hypervideo_dl/extractor/cspan.py b/hypervideo_dl/extractor/cspan.py index 2e01aff..f51159b 100644 --- a/hypervideo_dl/extractor/cspan.py +++ b/hypervideo_dl/extractor/cspan.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_HTMLParseError from ..utils import ( determine_ext, ExtractorError, @@ -11,14 +12,16 @@ from ..utils import ( get_element_by_attribute, get_element_by_class, int_or_none, + join_nonempty, js_to_json, merge_dicts, parse_iso8601, + parse_qs, smuggle_url, str_to_int, unescapeHTML, ) -from .senateisvp import SenateISVPIE +from .senategov import SenateISVPIE from .ustream import UstreamIE @@ -126,8 +129,12 @@ class CSpanIE(InfoExtractor): ext = 'vtt' subtitle['ext'] = ext ld_info = self._search_json_ld(webpage, video_id, default={}) - title = get_element_by_class('video-page-title', webpage) or \ - self._og_search_title(webpage) + try: + title = get_element_by_class('video-page-title', webpage) + except compat_HTMLParseError: + title = None + if title is None: + title = self._og_search_title(webpage) description = get_element_by_attribute('itemprop', 'description', webpage) or \ self._html_search_meta(['og:description', 'description'], webpage) return merge_dicts(info, ld_info, { @@ -242,3 +249,42 @@ class CSpanIE(InfoExtractor): 'title': title, 'id': 'c' + video_id if video_type == 'clip' else video_id, } + + +class CSpanCongressIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?c-span\.org/congress/' + _TESTS = [{ + 'url': 'https://www.c-span.org/congress/?chamber=house&date=2017-12-13&t=1513208380', + 'info_dict': { + 'id': 'house_2017-12-13', + 'title': 'Congressional Chronicle - Members of Congress, Hearings and More', + 'description': 'md5:54c264b7a8f219937987610243305a84', + 'thumbnail': r're:https://ximage.c-spanvideo.org/.+', + 'ext': 'mp4' + } + }] + + def 
_real_extract(self, url): + query = parse_qs(url) + video_date = query.get('date', [None])[0] + video_id = join_nonempty(query.get('chamber', ['senate'])[0], video_date, delim='_') + webpage = self._download_webpage(url, video_id) + if not video_date: + jwp_date = re.search(r'jwsetup.clipprogdate = \'(?P\d{4}-\d{2}-\d{2})\';', webpage) + if jwp_date: + video_id = f'{video_id}_{jwp_date.group("date")}' + jwplayer_data = self._parse_json( + self._search_regex(r'jwsetup\s*=\s*({(?:.|\n)[^;]+});', webpage, 'player config'), + video_id, transform_source=js_to_json) + + title = (self._og_search_title(webpage, default=None) + or self._html_extract_title(webpage, 'video title')) + description = (self._og_search_description(webpage, default=None) + or self._html_search_meta('description', webpage, 'description', default=None)) + + return { + **self._parse_jwplayer_data(jwplayer_data, video_id, False), + 'title': re.sub(r'\s+', ' ', title.split('|')[0]).strip(), + 'description': description, + 'http_headers': {'Referer': 'https://www.c-span.org/'}, + } diff --git a/hypervideo_dl/extractor/ctvnews.py b/hypervideo_dl/extractor/ctvnews.py index 03f8cef..952f4c7 100644 --- a/hypervideo_dl/extractor/ctvnews.py +++ b/hypervideo_dl/extractor/ctvnews.py @@ -65,4 +65,9 @@ class CTVNewsIE(InfoExtractor): }) entries = [ninecninemedia_url_result(clip_id) for clip_id in orderedSet( re.findall(r'clip\.id\s*=\s*(\d+);', webpage))] + if not entries: + webpage = self._download_webpage(url, page_id) + if 'getAuthStates("' in webpage: + entries = [ninecninemedia_url_result(clip_id) for clip_id in + self._search_regex(r'getAuthStates\("([\d+,]+)"', webpage, 'clip ids').split(',')] return self.playlist_result(entries, page_id) diff --git a/hypervideo_dl/extractor/curiositystream.py b/hypervideo_dl/extractor/curiositystream.py index 034a5c9..b8abcf7 100644 --- a/hypervideo_dl/extractor/curiositystream.py +++ b/hypervideo_dl/extractor/curiositystream.py @@ -15,7 +15,6 @@ from ..utils import ( class CuriosityStreamBaseIE(InfoExtractor): _NETRC_MACHINE = 'curiositystream' _auth_token = None - _API_BASE_URL = 'https://api.curiositystream.com/v1/' def _handle_errors(self, result): error = result.get('error', {}).get('message') @@ -34,43 +33,46 @@ class CuriosityStreamBaseIE(InfoExtractor): self._handle_errors(result) return result['data'] - def _real_initialize(self): - email, password = self._get_login_info() - if email is None: - return + def _perform_login(self, username, password): result = self._download_json( - self._API_BASE_URL + 'login', None, data=urlencode_postdata({ - 'email': email, + 'https://api.curiositystream.com/v1/login', None, + note='Logging in', data=urlencode_postdata({ + 'email': username, 'password': password, })) self._handle_errors(result) - self._auth_token = result['message']['auth_token'] + CuriosityStreamBaseIE._auth_token = result['message']['auth_token'] class CuriosityStreamIE(CuriosityStreamBaseIE): IE_NAME = 'curiositystream' _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'https://app.curiositystream.com/video/2', 'info_dict': { 'id': '2', 'ext': 'mp4', 'title': 'How Did You Develop The Internet?', 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', + 'channel': 'Curiosity Stream', + 'categories': ['Technology', 'Interview'], + 'average_rating': 96.79, + 'series_id': '2', }, 'params': { - 'format': 'bestvideo', # m3u8 download 'skip_download': True, }, - } + }] + + 
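# Sketch of the token flow _perform_login implements above: the credentials
# are POSTed once and the returned auth_token is cached on the class for all
# later API calls. Illustrative only -- it uses `requests` instead of the
# extractor's own HTTP layer; the URL and response shape are the ones shown
# in this hunk:
import requests

def curiositystream_token(email, password):
    result = requests.post(
        'https://api.curiositystream.com/v1/login',
        data={'email': email, 'password': password}).json()
    return result['message']['auth_token']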
_API_BASE_URL = 'https://api.curiositystream.com/v1/media/' def _real_extract(self, url): video_id = self._match_id(url) formats = [] for encoding_format in ('m3u8', 'mpd'): - media = self._call_api('media/' + video_id, video_id, query={ + media = self._call_api(video_id, video_id, query={ 'encodingsNew': 'true', 'encodingsFormat': encoding_format, }) @@ -140,12 +142,33 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): 'duration': int_or_none(media.get('duration')), 'tags': media.get('tags'), 'subtitles': subtitles, + 'channel': media.get('producer'), + 'categories': [media.get('primary_category'), media.get('type')], + 'average_rating': media.get('rating_percentage'), + 'series_id': str(media.get('collection_id') or '') or None, } -class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): - IE_NAME = 'curiositystream:collection' - _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collections?|series)/(?P\d+)' +class CuriosityStreamCollectionBaseIE(CuriosityStreamBaseIE): + + def _real_extract(self, url): + collection_id = self._match_id(url) + collection = self._call_api(collection_id, collection_id) + entries = [] + for media in collection.get('media', []): + media_id = compat_str(media.get('id')) + media_type, ie = ('series', CuriosityStreamSeriesIE) if media.get('is_collection') else ('video', CuriosityStreamIE) + entries.append(self.url_result( + 'https://curiositystream.com/%s/%s' % (media_type, media_id), + ie=ie.ie_key(), video_id=media_id)) + return self.playlist_result( + entries, collection_id, + collection.get('title'), collection.get('description')) + + +class CuriosityStreamCollectionsIE(CuriosityStreamCollectionBaseIE): + IE_NAME = 'curiositystream:collections' + _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/collections/(?P\d+)' _API_BASE_URL = 'https://api.curiositystream.com/v2/collections/' _TESTS = [{ 'url': 'https://curiositystream.com/collections/86', @@ -156,7 +179,17 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): }, 'playlist_mincount': 7, }, { - 'url': 'https://app.curiositystream.com/collection/2', + 'url': 'https://curiositystream.com/collections/36', + 'only_matching': True, + }] + + +class CuriosityStreamSeriesIE(CuriosityStreamCollectionBaseIE): + IE_NAME = 'curiositystream:series' + _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:series|collection)/(?P\d+)' + _API_BASE_URL = 'https://api.curiositystream.com/v2/series/' + _TESTS = [{ + 'url': 'https://curiositystream.com/series/2', 'info_dict': { 'id': '2', 'title': 'Curious Minds: The Internet', @@ -164,23 +197,6 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): }, 'playlist_mincount': 16, }, { - 'url': 'https://curiositystream.com/series/2', - 'only_matching': True, - }, { - 'url': 'https://curiositystream.com/collections/36', + 'url': 'https://curiositystream.com/collection/2', 'only_matching': True, }] - - def _real_extract(self, url): - collection_id = self._match_id(url) - collection = self._call_api(collection_id, collection_id) - entries = [] - for media in collection.get('media', []): - media_id = compat_str(media.get('id')) - media_type, ie = ('series', CuriosityStreamCollectionIE) if media.get('is_collection') else ('video', CuriosityStreamIE) - entries.append(self.url_result( - 'https://curiositystream.com/%s/%s' % (media_type, media_id), - ie=ie.ie_key(), video_id=media_id)) - return self.playlist_result( - entries, collection_id, - collection.get('title'), collection.get('description')) diff --git a/hypervideo_dl/extractor/cybrary.py 
new file mode 100644
index 0000000..c278f0f
--- /dev/null
+++ b/hypervideo_dl/extractor/cybrary.py
@@ -0,0 +1,146 @@
+# coding: utf-8
+from .common import InfoExtractor
+
+from ..utils import (
+    ExtractorError,
+    smuggle_url,
+    str_or_none,
+    traverse_obj,
+    urlencode_postdata
+)
+
+
+class CybraryBaseIE(InfoExtractor):
+    _API_KEY = 'AIzaSyCX9ru6j70PX2My1Eq6Q1zoMAhuTdXlzSw'
+    _ENDPOINTS = {
+        'course': 'https://app.cybrary.it/courses/api/catalog/browse/course/{}',
+        'course_enrollment': 'https://app.cybrary.it/courses/api/catalog/{}/enrollment',
+        'enrollment': 'https://app.cybrary.it/courses/api/enrollment/{}',
+        'launch': 'https://app.cybrary.it/courses/api/catalog/{}/launch',
+        'vimeo_oembed': 'https://vimeo.com/api/oembed.json?url=https://vimeo.com/{}',
+    }
+    _NETRC_MACHINE = 'cybrary'
+    _TOKEN = None
+
+    def _perform_login(self, username, password):
+        CybraryBaseIE._TOKEN = self._download_json(
+            f'https://identitytoolkit.googleapis.com/v1/accounts:signInWithPassword?key={self._API_KEY}',
+            None, data=urlencode_postdata({'email': username, 'password': password, 'returnSecureToken': True}),
+            note='Logging in')['idToken']
+
+    def _real_initialize(self):
+        if not self._TOKEN:
+            self.raise_login_required(method='password')
+
+    def _call_api(self, endpoint, item_id):
+        return self._download_json(
+            self._ENDPOINTS[endpoint].format(item_id), item_id,
+            note=f'Downloading {endpoint} JSON metadata',
+            headers={'Authorization': f'Bearer {self._TOKEN}'})
+
+    def _get_vimeo_id(self, activity_id):
+        launch_api = self._call_api('launch', activity_id)
+
+        if launch_api.get('url'):
+            return self._search_regex(r'https?://player\.vimeo\.com/video/(?P<id>[0-9]+)', launch_api['url'], 'vimeo_id')
+        return traverse_obj(launch_api, ('vendor_data', 'content', ..., 'videoId'), get_all=False)
+
+
+class CybraryIE(CybraryBaseIE):
+    _VALID_URL = r'https?://app.cybrary.it/immersive/(?P<enrollment>[0-9]+)/activity/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'https://app.cybrary.it/immersive/12487950/activity/63102',
+        'md5': '9ae12d37e555cb2ed554223a71a701d0',
+        'info_dict': {
+            'id': '646609770',
+            'ext': 'mp4',
+            'title': 'Getting Started',
+            'thumbnail': 'https://i.vimeocdn.com/video/1301817996-76a268f0c56cff18a5cecbbdc44131eb9dda0c80eb0b3a036_1280',
+            'series_id': '63111',
+            'uploader_url': 'https://vimeo.com/user30867300',
+            'duration': 88,
+            'uploader_id': 'user30867300',
+            'series': 'Cybrary Orientation',
+            'uploader': 'Cybrary',
+            'chapter': 'Cybrary Orientation Series',
+            'chapter_id': '63110'
+        },
+        'expected_warnings': ['No authenticators for vimeo']
+    }, {
+        'url': 'https://app.cybrary.it/immersive/12747143/activity/52686',
+        'md5': '62f26547dccc59c44363e2a13d4ad08d',
+        'info_dict': {
+            'id': '445638073',
+            'ext': 'mp4',
+            'title': 'Azure Virtual Network IP Addressing',
+            'thumbnail': 'https://i.vimeocdn.com/video/936667051-1647ace66c627d4a2382185e0dae8deb830309bfddd53f8b2367b2f91e92ed0e-d_1280',
+            'series_id': '52733',
+            'uploader_url': 'https://vimeo.com/user30867300',
+            'duration': 426,
+            'uploader_id': 'user30867300',
+            'series': 'AZ-500: Microsoft Azure Security Technologies',
+            'uploader': 'Cybrary',
+            'chapter': 'Implement Network Security',
+            'chapter_id': '52693'
+        },
+        'expected_warnings': ['No authenticators for vimeo']
+    }]
+
+    def _real_extract(self, url):
+        activity_id, enrollment_id = self._match_valid_url(url).group('id', 'enrollment')
+        course = self._call_api('enrollment', enrollment_id)['content']
+        activity = traverse_obj(course, ('learning_modules', ..., 'activities', lambda _, v: int(activity_id) == v['id']), get_all=False)
+
+        if activity.get('type') not in ['Video Activity', 'Lesson Activity']:
+            raise ExtractorError('The activity is not a video', expected=True)
+
+        module = next((m for m in course.get('learning_modules') or []
+                       if int(activity_id) in traverse_obj(m, ('activities', ..., 'id') or [])), None)
+
+        vimeo_id = self._get_vimeo_id(activity_id)
+
+        return {
+            '_type': 'url_transparent',
+            'series': traverse_obj(course, ('content_description', 'title')),
+            'series_id': str_or_none(traverse_obj(course, ('content_description', 'id'))),
+            'id': vimeo_id,
+            'chapter': module.get('title'),
+            'chapter_id': str_or_none(module.get('id')),
+            'title': activity.get('title'),
+            'url': smuggle_url(f'https://player.vimeo.com/video/{vimeo_id}', {'http_headers': {'Referer': 'https://api.cybrary.it'}})
+        }
+
+
+class CybraryCourseIE(CybraryBaseIE):
+    _VALID_URL = r'https://app.cybrary.it/browse/course/(?P<id>[\w-]+)/?(?:$|[#?])'
+    _TESTS = [{
+        'url': 'https://app.cybrary.it/browse/course/az-500-microsoft-azure-security-technologies',
+        'info_dict': {
+            'id': 898,
+            'title': 'AZ-500: Microsoft Azure Security Technologies',
+            'description': 'md5:69549d379c0fc1dec92926d4e8b6fbd4'
+        },
+        'playlist_count': 59
+    }, {
+        'url': 'https://app.cybrary.it/browse/course/cybrary-orientation',
+        'info_dict': {
+            'id': 1245,
+            'title': 'Cybrary Orientation',
+            'description': 'md5:9e69ff66b32fe78744e0ad4babe2e88e'
+        },
+        'playlist_count': 4
+    }]
+
+    def _real_extract(self, url):
+        course_id = self._match_id(url)
+        course = self._call_api('course', course_id)
+        enrollment_info = self._call_api('course_enrollment', course['id'])
+
+        entries = [self.url_result(
+            f'https://app.cybrary.it/immersive/{enrollment_info["id"]}/activity/{activity["id"]}')
+            for activity in traverse_obj(course, ('content_item', 'learning_modules', ..., 'activities', ...))]
+
+        return self.playlist_result(
+            entries,
+            traverse_obj(course, ('content_item', 'id'), expected_type=str_or_none),
+            course.get('title'), course.get('short_description'))
diff --git a/hypervideo_dl/extractor/daftsex.py b/hypervideo_dl/extractor/daftsex.py
new file mode 100644
index 0000000..6037fd9
--- /dev/null
+++ b/hypervideo_dl/extractor/daftsex.py
@@ -0,0 +1,146 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_b64decode
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    parse_count,
+    parse_duration,
+    traverse_obj,
+    try_get,
+    unified_timestamp,
+)
+
+
+class DaftsexIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?daftsex\.com/watch/(?P<id>-?\d+_\d+)'
+    _TESTS = [{
+        'url': 'https://daftsex.com/watch/-35370899_456246186',
+        'md5': 'd95135e6cea2d905bea20dbe82cda64a',
+        'info_dict': {
+            'id': '-35370899_456246186',
+            'ext': 'mp4',
+            'title': 'just relaxing',
+            'description': 'just relaxing - Watch video Watch video in high quality',
+            'upload_date': '20201113',
+            'timestamp': 1605261911,
+            'thumbnail': r're:https://[^/]+/impf/-43BuMDIawmBGr3GLcZ93CYwWf2PBv_tVWoS1A/dnu41DnARU4\.jpg\?size=800x450&quality=96&keep_aspect_ratio=1&background=000000&sign=6af2c26ff4a45e55334189301c867384&type=video_thumb',
+        },
+    }, {
+        'url': 'https://daftsex.com/watch/-156601359_456242791',
+        'info_dict': {
+            'id': '-156601359_456242791',
+            'ext': 'mp4',
+            'title': 'Skye Blue - Dinner And A Show',
+            'description': 'Skye Blue - Dinner And A Show - Watch video Watch video in high quality',
+            'upload_date': '20200916',
+            'timestamp': 1600250735,
+            'thumbnail': 'https://psv153-1.crazycloud.ru/videos/-156601359/456242791/thumb.jpg?extra=i3D32KaBbBFf9TqDRMAVmQ',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        title = self._html_search_meta('name', webpage, 'title')
+        timestamp = unified_timestamp(self._html_search_meta('uploadDate', webpage, 'Upload Date', default=None))
+        description = self._html_search_meta('description', webpage, 'Description', default=None)
+
+        duration = parse_duration(self._search_regex(
+            r'Duration: ((?:[0-9]{2}:){0,2}[0-9]{2})',
+            webpage, 'duration', fatal=False))
+        views = parse_count(self._search_regex(
+            r'Views: ([0-9 ]+)',
+            webpage, 'views', fatal=False))
+
+        player_hash = self._search_regex(
+            r'DaxabPlayer\.Init\({[\s\S]*hash:\s*"([0-9a-zA-Z_\-]+)"[\s\S]*}',
+            webpage, 'player hash')
+        player_color = self._search_regex(
+            r'DaxabPlayer\.Init\({[\s\S]*color:\s*"([0-9a-z]+)"[\s\S]*}',
+            webpage, 'player color', fatal=False) or ''
+
+        embed_page = self._download_webpage(
+            'https://daxab.com/player/%s?color=%s' % (player_hash, player_color),
+            video_id, headers={'Referer': url})
+        video_params = self._parse_json(
+            self._search_regex(
+                r'window\.globParams\s*=\s*({[\S\s]+})\s*;\s*<\/script>',
+                embed_page, 'video parameters'),
+            video_id, transform_source=js_to_json)
+
+        server_domain = 'https://%s' % compat_b64decode(video_params['server'][::-1]).decode('utf-8')
+
+        cdn_files = traverse_obj(video_params, ('video', 'cdn_files')) or {}
+        if cdn_files:
+            formats = []
+            for format_id, format_data in cdn_files.items():
+                ext, height = format_id.split('_')
+                formats.append({
+                    'format_id': format_id,
+                    'url': f'{server_domain}/videos/{video_id.replace("_", "/")}/{height}.mp4?extra={format_data.split(".")[-1]}',
+                    'height': int_or_none(height),
+                    'ext': ext,
+                })
+            self._sort_formats(formats)
+
+            return {
+                'id': video_id,
+                'title': title,
+                'formats': formats,
+                'description': description,
+                'duration': duration,
+                'thumbnail': try_get(video_params, lambda vi: 'https:' + compat_b64decode(vi['video']['thumb']).decode('utf-8')),
+                'timestamp': timestamp,
+                'view_count': views,
+                'age_limit': 18,
+            }
+
+        item = self._download_json(
+            f'{server_domain}/method/video.get/{video_id}', video_id,
+            headers={'Referer': url}, query={
+                'token': video_params['video']['access_token'],
+                'videos': video_id,
+                'ckey': video_params['c_key'],
+                'credentials': video_params['video']['credentials'],
+            })['response']['items'][0]
+
+        formats = []
+        for f_id, f_url in item.get('files', {}).items():
+            if f_id == 'external':
+                return self.url_result(f_url)
+            ext, height = f_id.split('_')
+            height_extra_key = traverse_obj(video_params, ('video', 'partial', 'quality', height))
+            if height_extra_key:
+                formats.append({
+                    'format_id': f'{height}p',
+                    'url': f'{server_domain}/{f_url[8:]}&videos={video_id}&extra_key={height_extra_key}',
+                    'height': int_or_none(height),
+                    'ext': ext,
+                })
+        self._sort_formats(formats)
+
+        thumbnails = []
+        for k, v in item.items():
+            if k.startswith('photo_') and v:
+                width = k.replace('photo_', '')
+                thumbnails.append({
+                    'id': width,
+                    'url': v,
+                    'width': int_or_none(width),
+                })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'comment_count': int_or_none(item.get('comments')),
+            'description': description,
+            'duration': duration,
+            'thumbnails': thumbnails,
+            'timestamp': timestamp,
+            'view_count': views,
+            'age_limit': 18,
+        }
diff --git a/hypervideo_dl/extractor/dailymotion.py b/hypervideo_dl/extractor/dailymotion.py
index e04e10b..9cb5618 100644
--- a/hypervideo_dl/extractor/dailymotion.py
+++ b/hypervideo_dl/extractor/dailymotion.py
@@ -94,10 +94,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
     _VALID_URL = r'''(?ix)
                     https?://
                         (?:
-                            (?:(?:www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|\#)/)?video|swf)|
+                            (?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:(?:embed|swf|\#)/)|player\.html\?)?video|swf)|
                             (?:www\.)?lequipe\.fr/video
                         )
-                        /(?P<id>[^/?_]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))?
+                        [/=](?P<id>[^/?_&]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))?
                     '''
     IE_NAME = 'dailymotion'
     _TESTS = [{
@@ -115,6 +115,25 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
             'uploader_id': 'x1xm8ri',
             'age_limit': 0,
         },
+    }, {
+        'url': 'https://geo.dailymotion.com/player.html?video=x89eyek&mute=true',
+        'md5': 'e2f9717c6604773f963f069ca53a07f8',
+        'info_dict': {
+            'id': 'x89eyek',
+            'ext': 'mp4',
+            'title': "En quête d'esprit du 27/03/2022",
+            'description': 'md5:66542b9f4df2eb23f314fc097488e553',
+            'duration': 2756,
+            'timestamp': 1648383669,
+            'upload_date': '20220327',
+            'uploader': 'CNEWS',
+            'uploader_id': 'x24vth',
+            'age_limit': 0,
+            'view_count': int,
+            'like_count': int,
+            'tags': ['en_quete_d_esprit'],
+            'thumbnail': 'https://s2.dmcdn.net/v/Tncwi1YGKdvFbDuDY/x1080',
+        }
     }, {
         'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
         'md5': '2137c41a8e78554bb09225b8eb322406',
@@ -207,12 +226,10 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
         video_id, playlist_id = self._match_valid_url(url).groups()
 
         if playlist_id:
-            if not self.get_param('noplaylist'):
-                self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % playlist_id)
+            if self._yes_playlist(playlist_id, video_id):
                 return self.url_result(
                     'http://www.dailymotion.com/playlist/' + playlist_id,
                     'DailymotionPlaylist', playlist_id)
-            self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
 
         password = self.get_param('videopassword')
         media = self._call_api(
@@ -261,9 +278,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
                 continue
             if media_type == 'application/x-mpegURL':
                 formats.extend(self._extract_m3u8_formats(
-                    media_url, video_id, 'mp4',
-                    'm3u8' if is_live else 'm3u8_native',
-                    m3u8_id='hls', fatal=False))
+                    media_url, video_id, 'mp4', live=is_live, m3u8_id='hls', fatal=False))
             else:
                 f = {
                     'url': media_url,
@@ -305,7 +320,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
 
         return {
             'id': video_id,
-            'title': self._live_title(title) if is_live else title,
+            'title': title,
             'description': clean_html(media.get('description')),
             'thumbnails': thumbnails,
             'duration': int_or_none(metadata.get('duration')) or None,
diff --git a/hypervideo_dl/extractor/daum.py b/hypervideo_dl/extractor/daum.py
index 8aa2af9..4362e92 100644
--- a/hypervideo_dl/extractor/daum.py
+++ b/hypervideo_dl/extractor/daum.py
@@ -157,11 +157,8 @@ class DaumListIE(InfoExtractor):
         query_dict = parse_qs(url)
         if 'clipid' in query_dict:
             clip_id = query_dict['clipid'][0]
-            if self.get_param('noplaylist'):
-                self.to_screen('Downloading just video %s because of --no-playlist' % clip_id)
+            if not self._yes_playlist(list_id, clip_id):
                 return self.url_result(DaumClipIE._URL_TEMPLATE % clip_id, 'DaumClip')
-            else:
-                self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % list_id)
 
 
 class DaumPlaylistIE(DaumListIE):
diff --git a/hypervideo_dl/extractor/daystar.py b/hypervideo_dl/extractor/daystar.py
new file mode 100644
index 0000000..4f59d90
--- /dev/null
+++ b/hypervideo_dl/extractor/daystar.py
@@ -0,0 +1,48 @@
+from .common import InfoExtractor
+from ..utils import js_to_json, urljoin
+
+
+class DaystarClipIE(InfoExtractor):
+    IE_NAME = 'daystar:clip'
+    _VALID_URL = r'https?://player\.daystar\.tv/(?P<id>\w+)'
+    _TESTS = [{
+        'url': 'https://player.daystar.tv/0MTO2ITM',
+        'info_dict': {
+            'id': '0MTO2ITM',
+            'ext': 'mp4',
+            'title': 'The Dark World of COVID Pt. 1 | Aaron Siri',
+            'description': 'a420d320dda734e5f29458df3606c5f4',
+            'thumbnail': r're:^https?://.+\.jpg',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        src_iframe = self._search_regex(r'<iframe[^>]+src="([^"]+)"', webpage, 'src iframe')
+        webpage_iframe = self._download_webpage(
+            src_iframe.replace('player.php', 'config2.php'), video_id, headers={'Referer': src_iframe})
+
+        sources = self._parse_json(self._search_regex(
+            r'sources\:\s*(\[.*?\])', webpage_iframe, 'm3u8 source'), video_id, transform_source=js_to_json)
+
+        formats, subtitles = [], {}
+        for source in sources:
+            file = source.get('file')
+            if file and source.get('type') == 'm3u8':
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                    urljoin('https://www.lightcast.com/embed/', file),
+                    video_id, 'mp4', fatal=False, headers={'Referer': src_iframe})
+                formats.extend(fmts)
+                subtitles = self._merge_subtitles(subtitles, subs)
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': self._html_search_meta(['og:title', 'twitter:title'], webpage),
+            'description': self._html_search_meta(['og:description', 'twitter:description'], webpage),
+            'thumbnail': self._search_regex(r'image:\s*"([^"]+)', webpage_iframe, 'thumbnail'),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
diff --git a/hypervideo_dl/extractor/digitalconcerthall.py b/hypervideo_dl/extractor/digitalconcerthall.py
new file mode 100644
index 0000000..8398ae3
--- /dev/null
+++ b/hypervideo_dl/extractor/digitalconcerthall.py
@@ -0,0 +1,141 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+from ..utils import (
+    ExtractorError,
+    parse_resolution,
+    traverse_obj,
+    try_get,
+    urlencode_postdata,
+)
+
+
+class DigitalConcertHallIE(InfoExtractor):
+    IE_DESC = 'DigitalConcertHall extractor'
+    _VALID_URL = r'https?://(?:www\.)?digitalconcerthall\.com/(?P<language>[a-z]+)/concert/(?P<id>[0-9]+)'
+    _OAUTH_URL = 'https://api.digitalconcerthall.com/v2/oauth2/token'
+    _ACCESS_TOKEN = None
+    _NETRC_MACHINE = 'digitalconcerthall'
+    _TESTS = [{
+        'note': 'Playlist with only one video',
+        'url': 'https://www.digitalconcerthall.com/en/concert/53201',
+        'info_dict': {
+            'id': '53201-1',
+            'ext': 'mp4',
+            'composer': 'Kurt Weill',
+            'title': '[Magic Night]',
+            'thumbnail': r're:^https?://images.digitalconcerthall.com/cms/thumbnails.*\.jpg$',
+            'upload_date': '20210624',
+            'timestamp': 1624548600,
+            'duration': 2798,
+            'album_artist': 'Members of the Berliner Philharmoniker / Simon Rössler',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }, {
+        'note': 'Concert with several works and an interview',
+        'url': 'https://www.digitalconcerthall.com/en/concert/53785',
+        'info_dict': {
+            'id': '53785',
+            'album_artist': 'Berliner Philharmoniker / Kirill Petrenko',
+            'title': 'Kirill Petrenko conducts Mendelssohn and Shostakovich',
+        },
+        'params': {'skip_download': 'm3u8'},
+        'playlist_count': 3,
+    }]
+
+    def _perform_login(self, username, password):
+        token_response = self._download_json(
+            self._OAUTH_URL,
+            None, 'Obtaining token', errnote='Unable to obtain token', data=urlencode_postdata({
+                'affiliate': 'none',
+                'grant_type': 'device',
+                'device_vendor': 'unknown',
+                'app_id': 'dch.webapp',
+                'app_version': '1.0.0',
+                'client_secret': '2ySLN+2Fwb',
+            }), headers={
+                'Content-Type': 'application/x-www-form-urlencoded',
+            })
+        self._ACCESS_TOKEN = token_response['access_token']
+        try:
+            self._download_json(
+                self._OAUTH_URL,
+                None, note='Logging in', errnote='Unable to login', data=urlencode_postdata({
+                    'grant_type': 'password',
+                    'username': username,
+                    'password': password,
+                }), headers={
+                    'Content-Type': 'application/x-www-form-urlencoded',
+                    'Referer': 'https://www.digitalconcerthall.com',
+                    'Authorization': f'Bearer {self._ACCESS_TOKEN}'
+                })
+        except ExtractorError:
+            self.raise_login_required(msg='Login info incorrect')
+
+    def _real_initialize(self):
+        if not self._ACCESS_TOKEN:
+            self.raise_login_required(method='password')
+
+    def _entries(self, items, language, **kwargs):
+        for item in items:
+            video_id = item['id']
+            stream_info = self._download_json(
+                self._proto_relative_url(item['_links']['streams']['href']), video_id, headers={
+                    'Accept': 'application/json',
+                    'Authorization': f'Bearer {self._ACCESS_TOKEN}',
+                    'Accept-Language': language
+                })
+
+            m3u8_url = traverse_obj(
+                stream_info, ('channel', lambda x: x.startswith('vod_mixed'), 'stream', 0, 'url'), get_all=False)
+            formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', 'm3u8_native', fatal=False)
+            self._sort_formats(formats)
+
+            yield {
+                'id': video_id,
+                'title': item.get('title'),
+                'composer': item.get('name_composer'),
+                'url': m3u8_url,
+                'formats': formats,
+                'duration': item.get('duration_total'),
+                'timestamp': traverse_obj(item, ('date', 'published')),
+                'description': item.get('short_description') or stream_info.get('short_description'),
+                **kwargs,
+                'chapters': [{
+                    'start_time': chapter.get('time'),
+                    'end_time': try_get(chapter, lambda x: x['time'] + x['duration']),
+                    'title': chapter.get('text'),
+                } for chapter in item['cuepoints']] if item.get('cuepoints') else None,
+            }
+
+    def _real_extract(self, url):
+        language, video_id = self._match_valid_url(url).group('language', 'id')
+        if not language:
+            language = 'en'
+
+        thumbnail_url = self._html_search_regex(
+            r'(https?://images\.digitalconcerthall\.com/cms/thumbnails/.*\.jpg)',
+            self._download_webpage(url, video_id), 'thumbnail')
+        thumbnails = [{
+            'url': thumbnail_url,
+            **parse_resolution(thumbnail_url)
+        }]
+
+        vid_info = self._download_json(
+            f'https://api.digitalconcerthall.com/v2/concert/{video_id}', video_id, headers={
+                'Accept': 'application/json',
+                'Accept-Language': language
+            })
+        album_artist = ' / '.join(traverse_obj(vid_info, ('_links', 'artist', ..., 'name')) or '')
+
+        return {
+            '_type': 'playlist',
+            'id': video_id,
+            'title': vid_info.get('title'),
+            'entries': self._entries(traverse_obj(vid_info, ('_embedded', ..., ...)), language,
+                                     thumbnails=thumbnails, album_artist=album_artist),
+            'thumbnails': thumbnails,
+            'album_artist': album_artist,
+        }
diff --git a/hypervideo_dl/extractor/disney.py b/hypervideo_dl/extractor/disney.py
index f018cbe..0ad7b1f 100644
--- a/hypervideo_dl/extractor/disney.py
+++ b/hypervideo_dl/extractor/disney.py
@@ -7,8 +7,8 @@ from .common import InfoExtractor
 from ..utils import (
     int_or_none,
     unified_strdate,
-    compat_str,
     determine_ext,
+    join_nonempty,
     update_url_query,
 )
 
@@ -119,18 +119,13 @@ class DisneyIE(InfoExtractor):
                         continue
                     formats.append(f)
                 continue
-            format_id = []
-            if flavor_format:
-                format_id.append(flavor_format)
-            if tbr:
-                format_id.append(compat_str(tbr))
             ext = determine_ext(flavor_url)
             if flavor_format == 'applehttp' or ext == 'm3u8':
                 ext = 'mp4'
             width = int_or_none(flavor.get('width'))
             height = int_or_none(flavor.get('height'))
             formats.append({
-                'format_id': '-'.join(format_id),
+                'format_id': join_nonempty(flavor_format, tbr),
                 'url': flavor_url,
                 'width': width,
                 'height': height,
diff --git a/hypervideo_dl/extractor/dispeak.py b/hypervideo_dl/extractor/dispeak.py
index be7ad12..3d651f3 100644
--- a/hypervideo_dl/extractor/dispeak.py
+++ b/hypervideo_dl/extractor/dispeak.py
@@ -74,13 +74,11 @@ class DigitallySpeakingIE(InfoExtractor):
             tbr = int_or_none(bitrate)
             vbr = int_or_none(self._search_regex(
                 r'-(\d+)\.mp4', video_path, 'vbr', default=None))
-            abr = tbr - vbr if tbr and vbr else None
             video_formats.append({
                 'format_id': bitrate,
                 'url': url,
                 'tbr': tbr,
                 'vbr': vbr,
-                'abr': abr,
             })
         return video_formats
 
@@ -121,6 +119,7 @@ class DigitallySpeakingIE(InfoExtractor):
         video_formats = self._parse_mp4(metadata)
         if video_formats is None:
             video_formats = self._parse_flv(metadata)
+        self._sort_formats(video_formats)
 
         return {
             'id': video_id,
diff --git a/hypervideo_dl/extractor/dlive.py b/hypervideo_dl/extractor/dlive.py
index 90462c0..7410eb6 100644
--- a/hypervideo_dl/extractor/dlive.py
+++ b/hypervideo_dl/extractor/dlive.py
@@ -84,7 +84,7 @@ class DLiveStreamIE(InfoExtractor):
         self._sort_formats(formats)
         return {
             'id': display_name,
-            'title': self._live_title(title),
+            'title': title,
             'uploader': display_name,
             'uploader_id': username,
             'formats': formats,
diff --git a/hypervideo_dl/extractor/doodstream.py b/hypervideo_dl/extractor/doodstream.py
index 2c9ea68..f692127 100644
--- a/hypervideo_dl/extractor/doodstream.py
+++ b/hypervideo_dl/extractor/doodstream.py
@@ -20,6 +20,16 @@ class DoodStreamIE(InfoExtractor):
             'description': 'Kat Wonders - Monthly May 2020 | DoodStream.com',
             'thumbnail': 'https://img.doodcdn.com/snaps/flyus84qgl2fsk4g.jpg',
         }
+    }, {
+        'url': 'http://dood.watch/d/5s1wmbdacezb',
+        'md5': '4568b83b31e13242b3f1ff96c55f0595',
+        'info_dict': {
+            'id': '5s1wmbdacezb',
+            'ext': 'mp4',
+            'title': 'Kat Wonders - Monthly May 2020',
+            'description': 'Kat Wonders - Monthly May 2020 | DoodStream.com',
+            'thumbnail': 'https://img.doodcdn.com/snaps/flyus84qgl2fsk4g.jpg',
+        }
     }, {
         'url': 'https://dood.to/d/jzrxn12t2s7n',
         'md5': '3207e199426eca7c2aa23c2872e6728a',
@@ -34,31 +44,26 @@ class DoodStreamIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
+        url = f'https://dood.to/e/{video_id}'
         webpage = self._download_webpage(url, video_id)
 
-        if '/d/' in url:
-            url = "https://dood.to" + self._html_search_regex(
-                r'